In [45]:
import multiprocessing
import os
import re
import time
import numpy as np
import pandas as pd
import requests
import json
from ManualIndexer import ManualIndexer
import pickle
from pathlib import Path
from BM25 import BM25
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4.element import Comment
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from queue import Queue, Empty
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor
from Pr import Pr
from elasticsearch import Elasticsearch

In [46]:
from flask import Flask, request

app = Flask(__name__)

# Elasticsearch connection settings. Each value can be overridden through an
# environment variable; the fallbacks are the values that used to be
# hard-coded here, so an existing setup keeps working unchanged.
# NOTE(review): the fallback password is still a secret committed to source.
# Rotate it and remove the default once ES_PASSWORD is provided by the
# environment.
_es_user = os.environ.get("ES_USERNAME", "elastic")
_es_password = os.environ.get("ES_PASSWORD", "X7U+Q+3MvP3M*9xnjuBF")
_es_ca_certs = os.environ.get("ES_CA_CERTS", "~/http_ca.crt")

# "~" is a shell convention; Python's file APIs treat it as a literal
# directory name. Expand it to the user's home directory unless a path with
# that literal name really exists relative to the working directory.
if not os.path.exists(_es_ca_certs):
    _es_ca_certs = os.path.expanduser(_es_ca_certs)

# Shared clients are attached to the app object so the route handlers can
# reach them as app.es_client / app.manual_indexer.
app.es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=(_es_user, _es_password),
    ca_certs=_es_ca_certs,
)
app.manual_indexer = ManualIndexer()

@app.route('/search_es', methods=['GET'])
def search_es():
    """Search the ``simple`` Elasticsearch index with a plain ``match`` query.

    Reads the ``query`` URL parameter and matches it against the ``text``
    field, asking for at most 100 hits with the ``url_lists`` field excluded
    from the returned documents.

    Returns:
        A dict with ``status``, ``total_hit`` (total reported by
        Elasticsearch), ``results`` (one dict per hit with ``title``, ``url``,
        the first 100 characters of ``text`` and the hit ``score``) and
        ``elapse`` (wall-clock seconds from the start of the handler until the
        Elasticsearch call returned).
        If ``query`` is missing, an error dict with HTTP status 400.
    """
    start = time.time()

    # A missing parameter used to surface as an unhandled KeyError (HTTP 500);
    # report it as a client error instead.
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "Missing required 'query' parameter"}, 400

    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"match": {"text": query_term}})
    end = time.time()

    records = [
        {
            'title': hit["_source"]['title'],
            'url': hit["_source"]['url'],
            'text': hit["_source"]['text'][:100],  # short preview only
            'score': hit["_score"],
        }
        for hit in results['hits']['hits']
    ]

    return {
        'status': 'success',
        'total_hit': results['hits']['total']['value'],
        'results': records,
        'elapse': end - start,
    }

@app.route('/search_manual', methods=['GET'])
def search_manual():
    """Search with the in-process ``ManualIndexer`` and return highlighted snippets.

    Reads the ``query`` URL parameter, passes it to
    ``app.manual_indexer.query`` and replaces each result's ``text`` with the
    snippet produced by ``bold_text_query``. The ``url_lists`` column is
    removed from the response.

    Returns:
        A dict with ``status``, ``total_hit`` (number of rows returned by the
        indexer), ``results`` (one dict per row) and ``elapse`` (wall-clock
        seconds spent on the query and snippet building).
        If ``query`` is missing, an error dict with HTTP status 400.
    """
    start = time.time()

    # A missing parameter used to surface as an unhandled KeyError (HTTP 500);
    # report it as a client error instead.
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "Missing required 'query' parameter"}, 400

    # Perform manual indexing query
    results = app.manual_indexer.query(query_term)

    # Wrap the query term in <b>..</b> and keep only the surrounding context.
    snippets = results['text'].apply(lambda text: bold_text_query(text, query_term))

    # Attach the snippets to a copy so the frame handed back by the indexer
    # is not modified in place.
    results_df = results.drop('url_lists', axis=1).assign(text=snippets)
    end = time.time()

    return {
        'status': 'success',
        'total_hit': len(results),
        'results': results_df.to_dict('records'),
        'elapse': end - start,
    }

@app.route('/search', methods=['GET'])
def search():
    """Search the ``simple`` index, ranking by text relevance times PageRank.

    Reads the ``query`` URL parameter and runs a ``script_score`` query: the
    inner ``match`` on ``text`` produces the relevance ``_score``, which the
    script multiplies by the document's ``pagerank`` field. At most 100 hits
    are requested and ``url_lists`` is excluded from the returned documents.

    Returns:
        A dict with ``status``, ``total_hit`` (total reported by
        Elasticsearch), ``results`` (one dict per hit with ``title``, ``url``,
        the first 100 characters of ``text`` and the combined ``score``) and
        ``elapse`` (wall-clock seconds from the start of the handler until the
        Elasticsearch call returned).
        If ``query`` is missing, an error dict with HTTP status 400.
    """
    start = time.time()

    # A missing parameter used to surface as an unhandled KeyError (HTTP 500);
    # report it as a client error instead.
    query_term = request.args.get('query')
    if query_term is None:
        return {'status': 'error', 'message': "Missing required 'query' parameter"}, 400

    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100,
                                   query={"script_score": {"query": {"match": {"text": query_term}},
                                                           "script": {"source": "_score * doc['pagerank'].value"}}})
    end = time.time()

    records = [
        {
            'title': hit["_source"]['title'],
            'url': hit["_source"]['url'],
            'text': hit["_source"]['text'][:100],  # short preview only
            # The hit's score, not the whole _source document that was
            # previously placed in this column by mistake.
            'score': hit["_score"],
        }
        for hit in results['hits']['hits']
    ]

    return {
        'status': 'success',
        'total_hit': results['hits']['total']['value'],
        'results': records,
        'elapse': end - start,
    }

def bold_text_query(text, query_term, context_length=100):
    """Build an HTML snippet around the first occurrence of ``query_term``.

    ``text`` is split into sentences after ``.``, ``!`` or ``?``. The first
    sentence that contains the query (matched case-insensitively, as one
    literal phrase) is trimmed to ``context_length`` characters on either
    side of the first match. Every occurrence of the query inside that
    window is wrapped in ``<b>..</b>`` and ``...`` is appended.

    Args:
        text: Document text to search. Non-string values yield ``""``.
        query_term: Phrase to highlight. An empty value yields ``""``.
        context_length: Maximum number of characters kept before and after
            the first match.

    Returns:
        The HTML snippet, or ``""`` when there is nothing to highlight. The
        text around and inside the ``<b>`` tags is HTML-escaped, so markup in
        the source text or in the query is shown literally.
    """
    import html  # local import keeps this function self-contained

    # Nothing to highlight: missing text (e.g. NaN from a DataFrame) or an
    # empty query, which would otherwise "match" between every character.
    if not isinstance(text, str) or not query_term:
        return ""

    # Matching and highlighting share one case-insensitive pattern so that a
    # sentence selected for "camt" also gets "CAMT" wrapped in <b>.
    pattern = re.compile(re.escape(query_term), re.IGNORECASE)

    for sentence in re.split(r'(?<=[.!?]) +', text):
        first = pattern.search(sentence)
        if first is None:
            continue

        window_start = max(first.start() - context_length, 0)
        window_end = min(first.end() + context_length, len(sentence))
        window = sentence[window_start:window_end]

        # Rebuild the window piece by piece: escaped plain text between
        # matches, and each match (with its original casing) inside <b>.
        pieces = []
        cursor = 0
        for hit in pattern.finditer(window):
            pieces.append(html.escape(window[cursor:hit.start()]))
            pieces.append(f"<b>{html.escape(hit.group(0))}</b>")
            cursor = hit.end()
        pieces.append(html.escape(window[cursor:]))

        # Only the first matching sentence is used.
        return "".join(pieces) + "..."

    return ""


# Start the Flask server (debug mode off) only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [25/Feb/2024 21:41:52] "GET /search_manual?query=camt HTTP/1.1" 200 -


This is BM25  229
This is Hit  229


<h1>Discuss how this new mix of scores makes finding things better or worse.
</h1>

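- By combining BM25 with PageRank, the search system can better prioritize pages that are not only relevant to the query but also authoritative and widely linked across the web. This means users are more likely to find the most relevant results at the top of the list, leading to a better overall search experience. On the downside, because the two scores are multiplied, a page that matches the query very well but has a low PageRank can be pushed below a popular page that matches it only loosely.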