# 1. Collecting data set and Importing necessary libraries 

#### 1.1 Import necessary libraries

In [147]:
#importing all the libraries
import os
import pickle
import string

import networkx as nx
import nltk
import numpy as np
import pandas as pd
import scipy as sp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [148]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read through nltk.corpus.stopwords) and the 'punkt' models that
# nltk.word_tokenize needs.
# NOTE(review): the 'popular' collection appears to include both of the
# packages above already -- confirm before dropping the first two calls.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('popular')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\aksha\AppData\Roaming\nltk

True

#### 1.2 Collecting the data

In [149]:
#Loading the web graph
# nx.read_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.load, so reading the file directly works on old and new versions.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
with open("web_graph.gpickle", "rb") as graph_file:
    G = pickle.load(graph_file)

adj = nx.to_numpy_array(G)   # adjacency matrix of the web graph
adj_tran = adj.T             # its transpose, used later to look up incoming links
np.array(list(G.nodes))      # last expression: display the node labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [150]:
#Collecting the page text stored on every node of the web graph.
#Nodes are looked up by position, i.e. by the integer labels 0 .. len(G.nodes) - 1.
text = [G.nodes[node_id]['page_content'] for node_id in range(len(G.nodes))]
text[15]

'Business: Mills Grabs \\$1B Portfolio; Taubman Likely to Lose Contracts Mills Corp. agreed to purchase a 50 percent interest in nine malls owned by General Motors Asset Management Corp. for just over \\$1 billion, creating a new joint venture between the groups. The deal will extend ...'

# 2. Removal of punctuation and stop words

In [151]:
# remove punctuations
# tokenise the document
def tokenize(sentence):
    """Split ``sentence`` into word tokens and drop punctuation.

    Tokenization is done by ``nltk.word_tokenize``; only tokens made up
    entirely of letters and/or digits (``str.isalnum``) are kept.

    Returns:
        list of str: the alphanumeric tokens, in their original order.
    """
    return list(filter(str.isalnum, nltk.word_tokenize(sentence)))

In [152]:
# remove stop words from tokens
# The word list gets its own name: rebinding `stopwords` would replace the
# imported NLTK corpus reader, and re-running this cell would then fail
# because a list has no .words() method. A frozenset also makes each
# membership test constant-time.
STOP_WORDS = frozenset(stopwords.words('english'))


def stopwords_clr(sentence):
    """Return the tokens of ``sentence`` that are not English stop words.

    Args:
        sentence: iterable of string tokens.

    Returns:
        list of str: the tokens whose lower-cased form is not in the NLTK
        English stop-word list, in their original order and original case.
    """
    return [token for token in sentence if token.lower() not in STOP_WORDS]

# 3. Normalization using Porter Stemmer

In [153]:
#Porter stemmer shared by stem_tokens below
stem = PorterStemmer()

def stem_tokens(sentence):
    """Return the Porter stem of every token in ``sentence``.

    Args:
        sentence: iterable of string tokens.

    Returns:
        list of str: one stem per input token, in the same order.
    """
    return [stem.stem(word) for word in sentence]

# 4. Preprocessing data

In [154]:
#Cleaning pipeline for one document: tokenize -> drop stop words -> stem -> drop stop words again
def preprocess(cont):
    """Turn the raw text ``cont`` into a space-separated string of stems.

    Returns:
        str: the surviving stems joined by single spaces.
    """
    words = tokenize(cont)
    words = stopwords_clr(words)
    stems = stem_tokens(words)
    # Second stop-word pass, this time on the stemmed tokens.
    stems = stopwords_clr(stems)
    return " ".join(stems)

In [155]:
processed_data = []    # pre-processed text of each document; filled in the next cell, in the same order as `text`

In [156]:
# Build the list in one assignment instead of appending to a list created in
# another cell: re-running this cell then no longer adds every document a
# second time (which would also enlarge the document universe used by NOT).
processed_data = [preprocess(document) for document in text]

# 5. Construct inverted index

In [157]:
inv_index = {}      # inverted index: token -> list of indices of the documents that contain it

In [158]:
#Adding one document to the inverted index
def indexing(document, index):
    """Register document number ``index`` under every token of ``document``.

    The module-level ``inv_index`` dict is updated in place: each token maps
    to a list of document indices, and an index is stored at most once per
    token.

    Args:
        document: text to index; it is split with ``nltk.word_tokenize``.
        index: identifier stored in the posting lists for this document.
    """
    for token in nltk.word_tokenize(document):
        doc_ids = inv_index.setdefault(token, [])   # start a posting list on first sight
        if index not in doc_ids:
            doc_ids.append(index)

In [159]:
# Index every pre-processed document under its position in processed_data
for doc_id, document in enumerate(processed_data):
    indexing(document, doc_id)

In [160]:
keys = list(inv_index)                 # every term of the inverted index
postings = list(inv_index.values())    # the posting list of each term, in the same order as keys

# 7. Boolean query

In [161]:
def gen_posting(term, inv_index):
    """Look up the posting list of ``term``.

    Args:
        term: key to look up.
        inv_index: mapping from term to posting list.

    Returns:
        The list stored under ``term`` (the stored object itself, not a
        copy), or a new empty list when the term is not in the index.
    """
    return inv_index.get(term, [])

In [162]:
#Processing a boolean query and finding the appropriate documents
def boolean_query(query, inv_index):
    """Evaluate a boolean query against the inverted index.

    ``query`` is a whitespace-separated sequence of search terms and the
    operators ``and``, ``or`` and ``not``. Operators are recognised in any
    letter case. Search terms are Porter-stemmed before the lookup.

    Evaluation order: a ``not`` negates the single term that follows it,
    ``and`` is applied next (left to right), and the remaining ``or``
    operators are applied last (left to right).

    Args:
        query: the query string, e.g. ``"gunfire and not police"``.
        inv_index: mapping from stemmed term to the list of indices of the
            documents containing it.

    Returns:
        list: indices of the matching documents, as a new list (the index's
        own posting lists are never returned). The order is unspecified. A
        query without any search term yields an empty list.

    Raises:
        IndexError: if ``and`` / ``or`` is missing one of its operands.

    Note:
        ``not`` is evaluated against ``range(len(processed_data))``, i.e. the
        module-level list of pre-processed documents.
    """
    stemmer = PorterStemmer()
    operators = []   # lower-cased operators, in query order
    operands = []    # one posting list per search term, in query order

    # split() without an argument ignores repeated/leading/trailing blanks,
    # which would otherwise be treated as (empty) search terms.
    for token in query.split():
        keyword = token.lower()
        if keyword in ('and', 'or', 'not'):
            # Store the normalised form: the evaluation below compares
            # against lower-case literals, so keeping e.g. 'AND' verbatim
            # would never match any branch and the loop would not terminate.
            operators.append(keyword)
        else:
            operands.append(gen_posting(stemmer.stem(token), inv_index))

    if not operands:
        return []

    all_docs = set(range(len(processed_data)))

    # Pass 1: resolve NOT and AND from left to right. Every operator left of
    # `pos` is an 'or', so operator `pos` sits between operands `pos` and
    # `pos + 1` (a 'not' at `pos` applies to operand `pos`).
    pos = 0
    while pos < len(operators):
        operator = operators[pos]
        if operator == 'or':
            pos += 1                       # handled in pass 2
        elif operator == 'not':
            operands[pos] = list(all_docs - set(operands[pos]))
            del operators[pos]
        else:                              # 'and'
            right = operands[pos + 1]
            if pos + 1 < len(operators) and operators[pos + 1] == 'not':
                # "x and not y": negate the right-hand side first.
                right = list(all_docs - set(right))
                del operators[pos + 1]
            # Replace the two operands by their intersection.
            operands[pos:pos + 2] = [list(set(operands[pos]).intersection(right))]
            del operators[pos]

    # Pass 2: only 'or' operators are left; fold the operands left to right,
    # appending the documents that are not in the running result yet.
    result = operands[0]
    for k in range(1, len(operators) + 1):
        result = result + list(set(operands[k]) - set(result))

    return list(result)

In [163]:
def gen_query(string):
    """Build an AND-only boolean query from free text.

    The text is tokenized, stop words are removed, and the remaining tokens
    are chained with ``' and '``.

    Args:
        string: the raw search text.

    Returns:
        str: e.g. ``'mills and portfolio'``; a single token is returned
        as-is and an input without usable tokens gives ``''``.
    """
    terms = stopwords_clr(tokenize(string))
    return ' and '.join(terms)

In [164]:
# Read the free-text search string from the user.
# NOTE(review): the variable name `string` shadows the `string` module
# imported at the top of the notebook.
string = input()

# Convert the free text into an AND-only boolean query.
inp_query = gen_query(string)

# Last expression of the cell: display the generated query.
inp_query

gunfire


'gunfire'

In [165]:
# Run the generated query against the inverted index; out_doc receives the list returned by boolean_query
out_doc = boolean_query(inp_query, inv_index)

# 8. Generating Base Set

In [166]:
def gen_root_bin(out_doc, G):
    """Encode the root set as a 0/1 indicator vector over the graph's nodes.

    Args:
        out_doc: sequence of node indices that make up the root set.
        G: object whose ``nodes`` attribute has a length; positions
            ``0 .. len(G.nodes) - 1`` are tested for membership.

    Returns:
        numpy.ndarray: entry ``i`` is 1 when ``i`` occurs in ``out_doc`` and
        0 otherwise.
    """
    members = np.array(out_doc)
    flags = [1 if position in members else 0 for position in range(len(G.nodes))]
    return np.array(flags)

In [167]:
# Indicator vector of the root set (the documents returned by the query);
# the bare name on the last line displays it.
root_bin = gen_root_bin(out_doc, G)
root_bin

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [168]:
def gen_base(adj, adj_tran, out_doc, G):
    """Compute the base set: the root set plus its direct graph neighbours.

    Args:
        adj: adjacency matrix of the graph.
        adj_tran: the matrix used for the opposite link direction (the
            notebook passes the transpose of ``adj``).
        out_doc: node indices of the root set.
        G: the graph; only ``len(G.nodes)`` is used here.

    Returns:
        numpy.ndarray: the node indices, in ascending order, whose combined
        membership count is positive.
    """
    root_flags = gen_root_bin(out_doc, G)

    # root_flags . adj      is positive at j when a root node has a non-zero
    #                       entry in column j of adj;
    # root_flags . adj_tran does the same for adj_tran, i.e. for the other
    #                       link direction when adj_tran is adj transposed.
    # Adding root_flags keeps the root nodes themselves in the set.
    membership = np.dot(root_flags, adj) + np.dot(root_flags, adj_tran) + root_flags

    return np.array([node for node in range(len(G.nodes)) if membership[node] > 0])

In [169]:
# Base set for the current query: the root set plus the nodes selected through
# adj and adj_tran; the bare name on the last line displays it.
base = gen_base(adj, adj_tran, out_doc, G)
base

array([ 8, 15, 66, 77])

In [170]:
def gen_subgraph(G, base):
    """Extract the sub-graph of ``G`` induced by the nodes in ``base``.

    Args:
        G: the full graph.
        base: the nodes to keep.

    Returns:
        tuple: ``(A, sub_nodes, SG)`` -- the sub-graph's adjacency matrix,
        an array of the sub-graph's nodes in the sub-graph's own iteration
        order (which may differ from the order of ``base``), and the
        sub-graph itself.
    """
    induced = nx.subgraph(G, base)
    node_order = np.array(list(induced.nodes))
    matrix = nx.to_numpy_array(induced)
    return matrix, node_order, induced

In [171]:
# Build the base-set sub-graph, then print its node order followed by its
# adjacency matrix (separated by an empty line).
A, sub_nodes, SG = gen_subgraph(G, base)
print(sub_nodes)
print()
print(A)

[ 8 66 77 15]

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 1. 0. 1.]
 [1. 1. 1. 0.]]


# 9. Principal Eigenvector Method

In [172]:
# Matrices whose principal eigenvectors are used below as the hub scores
# (A . A^T) and the authority scores (A^T . A).
h_mat = A @ A.T
a_mat = A.T @ A

In [173]:
def gen_scores(K):
    """Return the principal eigenvector of ``K``, scaled so its entries sum to 1.

    Args:
        K: square matrix.

    Returns:
        numpy.ndarray: the real part of the eigenvector belonging to the
        largest eigenvalue (as selected by ``argmax``), divided by the sum
        of its entries. ``None`` when ``K`` has no eigenvalues (empty
        matrix); the calling cells test for ``None``.
    """
    eigenvalues, eigenvectors = np.linalg.eig(K)

    if len(eigenvalues) == 0:
        return None

    principal = eigenvectors[:, eigenvalues.argmax()]
    # Dividing by the sum also removes the arbitrary sign of the eigenvector.
    return (principal / principal.sum()).real

In [174]:
eig_h = gen_scores(h_mat)

if eig_h is None:
    # gen_scores returns None when the sub-graph is empty.
    print('Required pages do not exist')

else:
    # Map every sub-graph node to its hub score, rounded to 3 decimals.
    h_map = {}
    for x in range(len(sub_nodes)):
        h_map[sub_nodes[x]] = round(eig_h[x], 3)

    # Highest score first; equal scores are ordered by node id, descending.
    h_map_sorted = sorted(h_map.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)

    print('Top 3 Hub pages and Scores:')

    # Print the three best pages plus any following pages tied with the
    # third one (all of them when there are fewer than three). Walking the
    # list itself keeps the tie check inside its bounds; the previous
    # index-based loop read position 3 even when exactly three pages
    # existed and raised IndexError.
    for rank, page in enumerate(h_map_sorted):
        if rank >= 3 and page[1] != h_map_sorted[2][1]:
            break
        print(page)

Top 3 Hub pages and Scores:
(15, 0.385)
(77, 0.366)
(8, 0.165)


In [175]:
eig_a = gen_scores(a_mat)

if eig_a is None:
    # gen_scores returns None when the sub-graph is empty.
    print('Required pages do not exist')

else:
    # Map every sub-graph node to its authority score, rounded to 3 decimals.
    a_map = {node: round(score, 3) for node, score in zip(sub_nodes, eig_a)}

    # Highest score first; equal scores are ordered by node id, descending.
    a_map_sorted = sorted(a_map.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

    print('Top 3 Authority pages and Scores:')

    if len(a_map) > 3:
        # Three best pages, followed by every page tied with the third one.
        third_score = a_map_sorted[2][1]
        for rank, page in enumerate(a_map_sorted):
            if rank >= 3 and not page[1] == third_score:
                break
            print(page)
    else:
        # Three pages or fewer: print them all.
        for page in a_map_sorted:
            print(page)

Top 3 Authority pages and Scores:
(66, 0.366)
(8, 0.3)
(77, 0.188)
