In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Load the dataset; the code below reads its 'SECTION_TEXT' and 'ARTICLE_ID' columns.
df = pd.read_csv('data.csv')

#PREPROCESSING
# English stop words from the NLTK stopwords corpus, held in a set.
# preprocess_text drops any token found in this set.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens without stop words.

    The text is lowercased, every character that is not a letter or
    whitespace is deleted, and the result is tokenized with
    ``word_tokenize``. Blank tokens and tokens present in ``stop_words``
    are discarded.

    Returns an empty list when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = []
    for candidate in word_tokenize(letters_only):
        # Skip blank tokens and English stop words.
        if not candidate.strip() or candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Replace each raw section text with its list of preprocessed tokens.
df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(preprocess_text)

#VOCABULARY
# Maps every unique word to an integer index. Indices are handed out in order
# of first appearance across the sections: 0, 1, 2, ...
word_to_index = {}

for tokens_list in df['SECTION_TEXT']:
    for word in tokens_list:
        # setdefault only inserts when the word is new, so the index of a
        # word that was already seen never changes.
        word_to_index.setdefault(word, len(word_to_index))

print("Vocabulary:")
# Indices were assigned in insertion order, so plain dict iteration already
# lists the vocabulary by ascending index.
for word, index in word_to_index.items():
    print(index, word)
print("\n")

#TERM FREQUENCY
# For every section, print "(word index, occurrences)" for each distinct word,
# in the order the words first appear in that section.
for idx, tokens_list in enumerate(df['SECTION_TEXT']):
    # Number of occurrences of each word within this one section.
    term_frequency = {}
    for word in tokens_list:
        term_frequency[word] = term_frequency.get(word, 0) + 1

    print(f"Term frequencies for Section {df['ARTICLE_ID'][idx]}:")
    for word, frequency in term_frequency.items():
        print(f"({word_to_index[word]}, {frequency})", end=", ")
    print("\n")

#IDF or DOCUMENT FREQUENCY OF EACH WORD
# Document frequency of a word = number of section texts that contain it at
# least once. Every vocabulary word starts at 0 so the dictionary keeps the
# vocabulary's order.
word_document_frequency = dict.fromkeys(word_to_index, 0)

# Single pass over the sections. set() collapses repeated tokens so a section
# contributes at most 1 to a word's count. The vocabulary was built from this
# same column, so every token already has an entry.
for tokens_list in df['SECTION_TEXT']:
    for word in set(tokens_list):
        word_document_frequency[word] += 1

print("Document Frequencies:")
for word, frequency in word_document_frequency.items():
    print(f"({word_to_index[word]}, {frequency})")

#TF/IDF WEIGHTS
# For every section, print "(word index, weight)" for each distinct word, where
# weight = occurrences in the section / number of sections containing the word.
print("Weights for each word in the document:")
for row_number, section_tokens in enumerate(df['SECTION_TEXT']):
    # Occurrences of each distinct word in this section, in first-seen order.
    occurrences = {}
    for token in section_tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    print(f"Weights for Section {df['ARTICLE_ID'][row_number]}:")
    for token, count in occurrences.items():
        vocabulary_index = word_to_index[token]
        ratio = count / word_document_frequency[token]
        print(f"({vocabulary_index}, {ratio:.5f})", end=", ")
    print("\n")


#VECTOR SPACE MODEL

#VECTOR LIST

# One weight vector per section, with one slot per vocabulary word.
# Only the first 10 sections are turned into vectors.
section_weights = []
vocabulary_size = len(word_to_index)

for section_tokens in df['SECTION_TEXT'][:10]:
    # Occurrences of each distinct word in this section.
    counts = {}
    for token in section_tokens:
        counts[token] = counts.get(token, 0) + 1

    # Slots of words absent from the section stay at 0.
    vector = [0] * vocabulary_size
    for token, count in counts.items():
        if token not in word_to_index:
            continue
        # Weight = term frequency / document frequency, stored to 5 decimals.
        vector[word_to_index[token]] = round(count / word_document_frequency[token], 5)

    section_weights.append(vector)

for position, vector in enumerate(section_weights):
    rounded = [round(value, 5) for value in vector]
    print(f"\nWeights for Section {df['ARTICLE_ID'][position]}:", rounded)


#QUERY LIST

# Query vector over the vocabulary: one slot per word index. Slots of words
# that are not part of the query stay at 0.
query_list = [0] * len(word_to_index)

#Define the sentence to search for
query_sentence = input("Enter a sentence: ")
query_words = preprocess_text(query_sentence)

# Vocabulary indices of the distinct query words that exist in the vocabulary.
query_word_indices = []
# Query weight of each of those words, aligned with query_word_indices.
word_relevance_scores = []

# dict.fromkeys drops repeated query words while keeping their order, so a
# word typed twice is weighted once instead of being accumulated twice.
for query_word in dict.fromkeys(query_words):
    # Words outside the vocabulary have no slot in the vectors; skip them.
    if query_word not in word_to_index:
        continue

    query_word_index = word_to_index[query_word]
    document_frequency = word_document_frequency[query_word]

    # Sum term_frequency / document_frequency over every section text that
    # contains the word, counting those sections as we go.
    document_counter = 0
    weight_sum = 0
    for tokens_list in df['SECTION_TEXT']:
        term_frequency = tokens_list.count(query_word)
        if term_frequency:
            document_counter += 1
            weight_sum += term_frequency / document_frequency

    # The query weight is the average of those per-section weights.
    if document_counter > 0:
        query_list[query_word_index] = weight_sum / document_counter

    query_word_indices.append(query_word_index)
    word_relevance_scores.append(query_list[query_word_index])

print("\n\nQuery List:", query_list)

# Relevance of a section = dot product of its weight vector with the query
# vector. Only the query's own vocabulary slots can be non-zero, so only
# those are multiplied. Both vectors are indexed by vocabulary index.
section_relevance_scores = []
for section_weight in section_weights:
    section_total_relevance_score = sum(
        section_weight[query_word_index] * query_list[query_word_index]
        for query_word_index in query_word_indices
    )
    section_relevance_scores.append(section_total_relevance_score)

# Report every section that shares at least one weighted word with the query.
for idx, score in enumerate(section_relevance_scores):
    if score > 0:
        section_text = ' '.join(df['SECTION_TEXT'][idx])
        print(f"\nRelevance score for Section {df['ARTICLE_ID'][idx]}:", score)
        print("Section text content:", section_text)



Vocabulary:
0 anarchism
1 political
2 philosophy
3 advocates
4 selfgoverned
5 societies
6 based
7 voluntary
8 institutions
9 often
10 described
11 stateless
12 although
13 several
14 authors
15 defined
16 specifically
17 nonhierarchical
18 free
19 associations
20 holds
21 state
22 undesirable
23 unnecessary
24 harmful
25 antistatism
26 central
27 entails
28 opposing
29 authority
30 hierarchical
31 organisation
32 conduct
33 human
34 relations
35 including
36 limited
37 system
38 usually
39 considered
40 extreme
41 leftwing
42 ideology
43 much
44 anarchist
45 economics
46 legal
47 reflects
48 antiauthoritarian
49 interpretations
50 communism
51 collectivism
52 syndicalism
53 mutualism
54 participatory
55 offer
56 fixed
57 body
58 doctrine
59 single
60 particular
61 world
62 view
63 instead
64 fluxing
65 flowing
66 many
67 types
68 traditions
69 exist
70 mutually
71 exclusive
72 schools
73 thought
74 differ
75 fundamentally
76 supporting
77 anything
78 individualism
79 complete
80 strain

Enter a sentence:  anarchism is a political based




Query List: [0.9173553719008265, 0.07852468768590126, 0, 0, 0, 0, 0.019064359113034092, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0