In [1]:
import os
from bs4 import BeautifulSoup
import codecs
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
import re
import json

class index_genertor:
    """Build a positional inverted index for the files of one directory.

    Every non-directory entry directly inside ``src_path`` is read, stripped
    of markup with BeautifulSoup, tokenised, stop-word filtered and stemmed.
    The resulting index can be written to a pair of text files
    (``index_<id>_terms.txt`` / ``index_<id>_postings.txt``).

    Attributes:
        id: Identifier of this partial index; used as prefix of document ids
            and in the output file names.
        src: Directory whose files are indexed.
        src_map: Maps generated document id -> file path.
        count: Number of documents indexed so far (suffix of the next id).
    """

    def __init__(self, src_path, src_id):
        self.id = src_id
        self.src = src_path
        self.src_map = dict()
        self.count = 0

    def read(self, src_path):
        """Return the content of ``src_path`` decoded as UTF-8.

        Undecodable bytes are dropped (``errors='ignore'``).
        """
        # The context manager closes the handle even if read() raises.
        with codecs.open(src_path, 'r', encoding="utf-8", errors='ignore') as src_data:
            return src_data.read()

    def process(self, text):
        """Collapse ``text`` into non-empty, stripped chunks, one per line.

        Each line is stripped, split on double spaces, and every non-empty
        piece ends up on its own line of the returned string.
        """
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '\n'.join(chunk for chunk in chunks if chunk)

    def analyze(self, text):
        """Tokenise ``text`` and return the list of stemmed index terms.

        Only purely alphabetic tokens are kept. Tokens are lower-cased
        *before* the stop-word check so that capitalised stop words such as
        "The" are removed as well, then stemmed with the English Snowball
        stemmer.
        """
        pattern = re.compile(r"^[a-zA-Z]+$")
        stop_words = set(stopwords.words('english'))
        lowered = (
            term.lower()
            for term in word_tokenize(text)
            if pattern.match(term) is not None
        )
        filtered_text = [term for term in lowered if term not in stop_words]
        ss = SnowballStemmer("english")
        return [ss.stem(w) for w in filtered_text]

    def info_to_pos_info(self, info):
        """Flatten ``{term: {doc_id: [pos, ...]}}`` into posting lists.

        Returns ``{term: [df, doc_id, tf, pos..., doc_id, tf, pos..., ...]}``
        where ``df`` is the number of documents for the term and ``tf`` the
        number of stored positions for that document. ``df`` and ``tf`` are
        strings; document ids and positions are passed through unchanged.
        """
        pos_info = dict()
        for term, docs in info.items():
            ps = [str(len(docs))]  # df
            for src_id, positions in docs.items():
                ps.append(src_id)
                ps.append(str(len(positions)))  # tf
                ps.extend(positions)
            pos_info[term] = ps
        return pos_info

    def to_index(self):
        """Index every file directly inside ``self.src``.

        Document ids are ``str(self.id) + str(self.count)``; the id -> path
        mapping is recorded in ``self.src_map``. For each (term, document)
        the first stored value is the 1-based position of the first
        occurrence; every later value is the occurrence's 1-based position
        minus that first position.

        Returns:
            The flattened posting lists produced by ``info_to_pos_info``.
        """
        info = dict()
        for src in os.listdir(self.src):
            src_path = os.path.join(self.src, src)
            if os.path.isdir(src_path):
                continue  # sub-directories are not descended into
            src_id = str(self.id) + str(self.count)
            self.src_map[src_id] = src_path
            self.count += 1
            parsed = BeautifulSoup(self.read(src_path), features="html.parser")
            analyzed = self.analyze(self.process(parsed.get_text()))
            for index, term in enumerate(analyzed):
                positions = info.setdefault(term, dict()).setdefault(src_id, list())
                # Offsets are taken relative to the first recorded position.
                before = int(positions[0]) if positions else 0
                positions.append(str((index + 1) - before))
        return self.info_to_pos_info(info)

    def write_index(self, pos_info):
        """Write ``pos_info`` to the terms and postings files of this index.

        The postings file holds one comma-joined posting list per line; the
        terms file holds ``term,offset`` lines where ``offset`` is the
        position reported by ``tell()`` on the postings file just before the
        term's line was written.
        """
        prefix = "index_" + str(self.id)
        with open(prefix + "_terms.txt", "w+") as tf, \
                open(prefix + "_postings.txt", "w+") as pf:
            for term, posting in pos_info.items():
                offset = pf.tell()
                tf.write(term + "," + str(offset) + "\n")
                pf.write(','.join(posting) + "\n")
                
def process_corpus(path):
    """Build one partial index per sub-directory of ``path``.

    For every directory directly inside ``path`` an ``index_genertor`` is
    created with the directory name as index id, its index is written to
    disk, and its document-id -> path mapping is collected. The combined
    mapping is stored as JSON in the file ``doc_map`` in the current working
    directory. Plain files directly inside ``path`` are ignored.
    """
    doc_map = dict()
    for d in os.listdir(path):
        dp = os.path.join(path, d)
        if not os.path.isdir(dp):
            continue
        ig = index_genertor(dp, d)
        ig.write_index(ig.to_index())
        doc_map.update(ig.src_map)
    # Serialise first, then write inside a context manager so the handle is
    # released even if the write fails.
    payload = json.dumps(doc_map)
    with open("doc_map", "w+") as dw:
        dw.write(payload)
def read_index(index_id):
    """Load the partial index ``index_id`` from its terms/postings files.

    Reads ``index_<index_id>_terms.txt`` (``term,offset`` per line) and, for
    each term, seeks to ``offset`` in ``index_<index_id>_postings.txt`` and
    reads one comma-separated posting line.

    Returns:
        dict mapping term -> posting list (list of strings).
    """
    prefix = "index_" + str(index_id)
    pos_info = dict()
    with open(prefix + "_terms.txt", "r") as tf, \
            open(prefix + "_postings.txt", "r") as pf:
        for record in tf:
            # Strip only the line terminator; blindly dropping the last
            # character corrupts a final line that has no trailing newline.
            record = record.rstrip("\n")
            if not record:
                continue  # tolerate blank lines
            fields = record.split(',')
            term = fields[0]
            offset = int(fields[1])
            pf.seek(offset)
            pos_info[term] = pf.readline().rstrip("\n").split(',')
    return pos_info

def merge_index(ids):
    """Combine the partial indexes named in ``ids`` into one index.

    Each id is loaded with ``read_index``. The first posting list seen for a
    term is adopted as-is; for later ones the document frequency (element 0)
    is summed and the remaining elements are appended.

    Returns:
        dict mapping term -> merged posting list (list of strings).
    """
    merged = dict()
    for index_id in ids:
        for term, posting in read_index(index_id).items():
            if term in merged:
                target = merged[term]
                curr_df = int(target[0])
                new_df = int(posting[0])
                target[0] = str(curr_df + new_df)
                target.extend(posting[1:])
            else:
                merged[term] = posting
    return merged
def write_merged_index(pos_info):
    """Write the merged index to ``inverted_index_{terms,postings}.txt``.

    The postings file receives one comma-joined posting list per line; the
    terms file receives ``term,offset`` lines, where ``offset`` is the value
    of ``tell()`` on the postings file just before the term's line is
    written.

    Args:
        pos_info: dict mapping term -> posting list (list of strings).
    """
    # Context managers make sure both files are closed even when join() or
    # write() raises part-way through.
    with open("inverted_index_terms.txt", "w+") as tf, \
            open("inverted_index_postings.txt", "w+") as pf:
        for term, posting in pos_info.items():
            offset = pf.tell()
            tf.write(term + "," + str(offset) + "\n")
            pf.write(','.join(posting) + "\n")
def read_merged_index():
    """Load the merged index from ``inverted_index_{terms,postings}.txt``.

    For each ``term,offset`` line of the terms file, seeks to ``offset`` in
    the postings file and reads one comma-separated posting line.

    Returns:
        dict mapping term -> posting list (list of strings).
    """
    pos_info = dict()
    with open("inverted_index_terms.txt", "r") as tf, \
            open("inverted_index_postings.txt", "r") as pf:
        for record in tf:
            # Strip only the line terminator; blindly dropping the last
            # character corrupts a final line that has no trailing newline.
            record = record.rstrip("\n")
            if not record:
                continue  # tolerate blank lines
            fields = record.split(',')
            term = fields[0]
            offset = int(fields[1])
            pf.seek(offset)
            pos_info[term] = pf.readline().rstrip("\n").split(',')
    return pos_info
def process_query(text):
    """Turn a free-text query into the list of stemmed index terms.

    Keeps only purely alphabetic tokens, lower-cases them, drops English
    stop words and applies the English Snowball stemmer. Lower-casing happens
    *before* the stop-word check so that capitalised stop words ("The",
    "And") are removed too.
    """
    pattern = re.compile(r"^[a-zA-Z]+$")
    stop_words = set(stopwords.words('english'))
    lowered = (
        term.lower()
        for term in word_tokenize(text)
        if pattern.match(term) is not None
    )
    filtered_text = [term for term in lowered if term not in stop_words]
    ss = SnowballStemmer("english")
    return [ss.stem(w) for w in filtered_text]
def search_query(text):
    """Return the ids of all documents containing any term of ``text``.

    The query is normalised with ``process_query`` and each resulting term is
    looked up in the merged index. A posting list has the layout
    ``[df, doc_id, tf, pos * tf, doc_id, tf, pos * tf, ...]``; exactly ``df``
    document entries are visited per term.

    Returns:
        List of document-id strings, in posting order per query term. A
        document matching several query terms appears once per term.
    """
    tokens = process_query(text)
    pos_info = read_merged_index()
    src = []
    for token in tokens:
        posting = pos_info.get(token)
        if posting is None:
            continue
        df = int(posting[0])
        cursor = 1  # index of the current document id within the posting
        # Visit all df entries. The previous loop ran only df - 1 times, so
        # the last document (or the only one, when df == 1) was never
        # reported.
        for _ in range(df):
            src.append(posting[cursor])
            tf = int(posting[cursor + 1])
            cursor += 2 + tf  # skip doc id, tf and the tf position values
    return src
def show_results(src):
    """Print the file path of every document id in ``src``.

    Paths are looked up in the JSON file ``doc_map`` in the current working
    directory.

    Args:
        src: Sequence of document-id strings.

    Returns:
        ``False`` when ``src`` is empty (after printing a "nothing found"
        message); ``None`` otherwise.

    Raises:
        KeyError: if an id in ``src`` is missing from ``doc_map``.
    """
    if not src:
        print("Nothin Found Sir!.")
        return False
    # The context manager releases the handle even if the JSON is malformed.
    with open("doc_map", "r") as d:
        doc_map = json.load(d)
    print("Found results in")
    for doc_id in src:
        print(doc_map[doc_id])



#main
if __name__ == "__main__":
    # Build the corpus path portably. The previous literal 'corpus\corpus1'
    # relied on "\c" being an unrecognised escape sequence and used a
    # Windows-only separator.
    process_corpus(os.path.join('corpus', 'corpus1'))
    # NOTE(review): assumes the corpus directory contains sub-directories
    # named "1", "2" and "3" -- confirm against the actual corpus layout.
    minfo = merge_index(["1","2","3"])
    write_merged_index(minfo)
    src = search_query("murtaza")
    show_results(src)

[nltk_data] Downloading package punkt to C:\Users\murtaza
[nltk_data]     hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\murtaza
[nltk_data]     hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nothin Found Sir!.


False