In [17]:
import pickle
import os
from pathlib import Path
from elasticsearch import Elasticsearch 
import json
import numpy as np
import pandas as pd

In [18]:

class Pr:
    """PageRank over the link graph of a crawled page collection.

    Every ``*.txt`` file in ``crawled_folder`` is read as a JSON document that
    carries a ``url`` string and a ``url_lists`` list of outgoing links.

    Attributes:
        crawled_folder: directory scanned for crawled pages.
        alpha: weight of following a page's real links; the remaining
            ``1 - alpha`` is spread uniformly over every known URL.
        pr_result: DataFrame indexed by URL with a single ``score`` column;
            only present after ``pr_calc()`` has run.
    """

    def __init__(self, alpha):
        """Remember the damping factor and where the crawled pages live.

        Args:
            alpha: share of the transition probability given to a page's own
                outgoing links.
        """
        self.crawled_folder = Path("resources/crawled")
        self.alpha = alpha

    def url_extractor(self):
        """Collect the link graph from the crawled files.

        Returns:
            tuple: ``(url_maps, all_urls)``. ``url_maps`` maps each crawled
            page URL to its de-duplicated outgoing links; ``all_urls`` lists
            every URL seen either as a page or as a link target.
        """
        url_maps = {}
        all_urls = set()

        for file in os.listdir(self.crawled_folder):
            if not file.endswith(".txt"):
                continue
            # Context manager: the original left one open handle per page.
            with open(os.path.join(self.crawled_folder, file)) as f:
                page = json.load(f)
            outlinks = set(page['url_lists'])
            all_urls.add(page['url'])
            all_urls.update(outlinks)
            url_maps[page['url']] = list(outlinks)

        return url_maps, list(all_urls)

    def pr_calc(self, tol=1e-8, max_iter=10_000):
        """Run power iteration and store the scores in ``self.pr_result``.

        The transition matrix gives every page with outgoing links
        ``(1 - alpha) / N`` towards each URL plus ``alpha / out_degree``
        towards each linked URL. Pages without outgoing links (or never
        crawled) jump uniformly to any URL.

        Args:
            tol: iteration stops once no component of the rank vector moves by
                more than this between two consecutive steps.
            max_iter: upper bound on the number of steps. Without it the loop
                never ends on chains that oscillate (possible when
                ``alpha == 1``).

        Side effects:
            Sets ``self.pr_result`` and prints a one-line summary.
        """
        url_maps, all_urls = self.url_extractor()
        n_urls = len(all_urls)

        if n_urls == 0:
            # Nothing crawled: expose an empty result instead of dividing by zero.
            self.pr_result = pd.DataFrame(columns=['score'], dtype=float)
            return

        position = {url: i for i, url in enumerate(all_urls)}
        uniform = 1 / n_urls

        # Start from the dangling-page row everywhere, then overwrite the rows
        # of pages that do have outgoing links. A float ndarray replaces the
        # object-dtype DataFrame / np.matrix pair of the earlier version.
        P = np.full((n_urls, n_urls), uniform, dtype=float)
        for url, outlinks in url_maps.items():
            if not outlinks:
                continue
            row = position[url]
            P[row, :] = (1 - self.alpha) * uniform
            targets = [position[target] for target in outlinks]
            # `outlinks` is de-duplicated, so each target column is hit once.
            P[row, targets] += self.alpha * (1 / len(outlinks))

        prev_Px = np.full(n_urls, uniform, dtype=float)
        Px = prev_Px @ P
        i = 0
        converged = not np.any(np.abs(prev_Px - Px) > tol)
        while not converged and i < max_iter:
            i += 1
            prev_Px = Px
            Px = Px @ P
            converged = not np.any(np.abs(prev_Px - Px) > tol)

        if converged:
            print('Converged in {0} iterations: {1}'.format(i, np.around(Px, 5)))
        else:
            print('Stopped after {0} iterations without converging: {1}'.format(i, np.around(Px, 5)))

        self.pr_result = pd.DataFrame({'score': Px}, index=all_urls)



In [None]:
import os
import json
import pickle
from pathlib import Path
from elasticsearch import Elasticsearch

class IndexerWithPR:
    """Index crawled pages, each tagged with its PageRank, into the 'simple' index.

    The crawl folder and the Elasticsearch connection default to the values
    this notebook was written with. They can be overridden through the
    constructor arguments or through the environment variables
    ``CRAWLED_FOLDER``, ``ES_URL``, ``ES_USER``, ``ES_PASSWORD`` and
    ``ES_CA_CERTS`` so that paths and credentials need not be edited in code.
    """

    def __init__(self, crawled_folder=None, es_client=None):
        """Load the URL list and PageRank instance and connect to Elasticsearch.

        Args:
            crawled_folder: directory holding the crawled ``*.txt`` files and
                ``url_list.pickle``. Defaults to ``$CRAWLED_FOLDER`` or the
                original hard-coded location.
            es_client: ready-made Elasticsearch client; when omitted one is
                built from the environment / original defaults.

        Raises:
            FileNotFoundError: if ``url_list.pickle`` or
                ``pickled/pr_instance.pkl`` is missing.
        """
        if crawled_folder is None:
            crawled_folder = os.environ.get(
                'CRAWLED_FOLDER',
                'C:/Users/user/Documents/Year3_2/IR/resources/crawled')
        self.crawled_folder = Path(crawled_folder)

        # NOTE: pickle.load runs arbitrary code embedded in the file; only
        # load pickles that were produced by this project.
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            self.file_mapper = pickle.load(f)

        if es_client is None:
            # NOTE(security): the literal password is kept only as a fallback
            # so existing runs keep working; set ES_PASSWORD and remove it.
            es_client = Elasticsearch(
                os.environ.get("ES_URL", "https://localhost:9200"),
                basic_auth=(os.environ.get("ES_USER", "elastic"),
                            os.environ.get("ES_PASSWORD", "ySGo56ThrQ2moHl+WbG2")),
                ca_certs=os.environ.get("ES_CA_CERTS", "~/http_ca.crt"))
        self.es_client = es_client

        with open('pickled/pr_instance.pkl', 'rb') as f:
            self.pr = pickle.load(f)
        self.indexed_data_path = "indexed_data.pkl"

    def _build_indexed_data(self):
        """Read every crawled ``*.txt`` page and attach ``id`` and ``pagerank``."""
        indexed_data = []
        for file in os.listdir(self.crawled_folder):
            if not file.endswith(".txt"):
                continue
            with open(os.path.join(self.crawled_folder, file), 'r') as f:
                j = json.load(f)
            j['id'] = j['url']
            # float(): keep the cached / indexed value a plain Python number
            # whatever dtype pr_result happens to use.
            j['pagerank'] = float(self.pr.pr_result.loc[j['id']].score)
            indexed_data.append(j)
        return indexed_data

    def run_indexer(self):
        """Recreate the 'simple' index and send every page document to it.

        Documents are taken from ``self.indexed_data_path`` when that pickle
        exists; otherwise they are built from the crawl folder and the pickle
        is written for the next run. The cache is never invalidated here, so
        delete the file after re-crawling or recomputing PageRank.
        """
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index='simple')
        self.es_client.options(ignore_status=[400]).indices.create(index='simple')

        if os.path.exists(self.indexed_data_path):
            print("Loading indexed data from pickle file...")
            with open(self.indexed_data_path, "rb") as f:
                indexed_data = pickle.load(f)
        else:
            print("Processing files and creating indexed data...")
            indexed_data = self._build_indexed_data()

            # Save processed data to a pickle file
            with open(self.indexed_data_path, "wb") as f:
                pickle.dump(indexed_data, f)

        # Send data to Elasticsearch. `document=` replaces the deprecated
        # `body=` keyword of the 8.x client.
        for doc in indexed_data:
            self.es_client.index(index='simple', document=doc)


In [24]:
# Build the indexer (its constructor reads url_list.pickle and
# pickled/pr_instance.pkl from disk) and (re)populate the 'simple' index.
# NOTE(review): no cell visible here creates pickled/pr_instance.pkl — confirm
# it is produced before this cell runs.
indexer = IndexerWithPR()
indexer.run_indexer()

In [None]:
from flask import Flask, request, render_template
import time
import pandas as pd

# Flask application for the template-rendered search pages defined in this cell.
app = Flask(__name__)
# search_es_pr() below reads `app.es_client`; without this assignment every
# search request on this app fails with AttributeError.
# NOTE(security): the literal password is only a fallback so existing runs keep
# working; set ES_PASSWORD (and ES_URL / ES_USER / ES_CA_CERTS) and remove it.
app.es_client = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    basic_auth=(os.environ.get("ES_USER", "elastic"),
                os.environ.get("ES_PASSWORD", "ySGo56ThrQ2moHl+WbG2")),
    ca_certs=os.environ.get("ES_CA_CERTS", "~/http_ca.crt"),
)

@app.route('/')
def home():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route('/search_es_pr', methods=['GET'])
def search_es_pr():
    """Search the 'simple' index and render the hits as an HTML page.

    The ``query`` request argument (empty string when absent) is matched
    against the ``text`` field; each hit's Elasticsearch score is multiplied by
    its stored ``pagerank`` value. At most 100 hits are requested and
    ``url_lists`` is left out of the returned sources.

    Returns:
        The rendered ``search_results.html`` template with the query, the hit
        summaries (title, url, first 100 characters of text), the total hit
        count and the elapsed search time in seconds.
    """
    started_at = time.time()
    query_term = request.args.get('query', '')

    # Text relevance scaled by the PageRank stored on each document.
    ranking_query = {
        "script_score": {
            "query": {"match": {"text": query_term}},
            "script": {"source": "_score * doc['pagerank'].value"}
        }
    }
    response = app.es_client.search(
        index='simple',
        source_excludes=['url_lists'],
        size=100,
        query=ranking_query,
    )

    finished_at = time.time()
    hit_count = response['hits']['total']['value']

    # Keep only what the template shows for each hit.
    results_list = []
    for hit in response['hits']['hits']:
        source = hit["_source"]
        results_list.append({
            'title': source['title'],
            'url': source['url'],
            'text': source['text'][:100],
        })

    return render_template(
        'search_results.html',
        query=query_term,
        results=results_list,
        total_hit=hit_count,
        elapse=finished_at - started_at,
    )




In [18]:
import pickle
from elasticsearch import Elasticsearch 
from flask import Flask,request
import time
import pandas as pd

In [None]:
import os
import json
import pickle
from pathlib import Path
from elasticsearch import Elasticsearch

class TFIDFRanker:
    """Index crawled pages into 'extend' with PageRank plus a search-derived score.

    The crawl folder and the Elasticsearch connection default to the values
    this notebook was written with. They can be overridden through the
    constructor arguments or through the environment variables
    ``CRAWLED_FOLDER``, ``ES_URL``, ``ES_USER``, ``ES_PASSWORD`` and
    ``ES_CA_CERTS``.
    """

    def __init__(self, crawled_folder=None, es_client=None):
        """Load the URL list and PageRank instance and connect to Elasticsearch.

        Args:
            crawled_folder: directory holding the crawled ``*.txt`` files and
                ``url_list.pickle``. Defaults to ``$CRAWLED_FOLDER`` or
                ``resources/crawled``.
            es_client: ready-made Elasticsearch client; when omitted one is
                built from the environment / original defaults.
        """
        if crawled_folder is None:
            crawled_folder = os.environ.get('CRAWLED_FOLDER', 'resources/crawled')
        self.crawled_folder = Path(crawled_folder)

        # Load file_mapper for file paths.
        # NOTE: pickle.load runs arbitrary code embedded in the file; only
        # load pickles that were produced by this project.
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            self.file_mapper = pickle.load(f)

        # Initialize Elasticsearch client
        if es_client is None:
            # NOTE(security): the literal password is kept only as a fallback
            # so existing runs keep working; set ES_PASSWORD and remove it.
            es_client = Elasticsearch(
                os.environ.get("ES_URL", "https://localhost:9200"),
                basic_auth=(os.environ.get("ES_USER", "elastic"),
                            os.environ.get("ES_PASSWORD", "ySGo56ThrQ2moHl+WbG2")),
                ca_certs=os.environ.get("ES_CA_CERTS", "~/http_ca.crt"))
        self.es_client = es_client

        # Load PageRank instance
        with open('pr_instance.pkl', 'rb') as f:
            self.pr = pickle.load(f)

    def run_indexer(self):
        """Recreate the 'extend' index and index every crawled page into it.

        Each document gets ``id``, ``pagerank``, ``tfidf_score`` and
        ``final_score`` (``pagerank + tfidf_score``) before being indexed.
        """
        # Delete and create the index
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index='extend')
        self.es_client.options(ignore_status=[400]).indices.create(index='extend')

        # pr_result is indexed by URL with one 'score' column. DataFrame.get()
        # looks up *columns*, so the earlier pr_result.get(url, {}) never found
        # a URL and every page received pagerank 0. Series.get() looks up the
        # index and still falls back to 0 for unknown URLs.
        pagerank_by_url = self.pr.pr_result['score']

        for file in os.listdir(self.crawled_folder):
            if not file.endswith(".txt"):
                continue
            with open(os.path.join(self.crawled_folder, file)) as f:
                j = json.load(f)
            j['id'] = j['url']

            pagerank_score = float(pagerank_by_url.get(j['id'], 0))
            j['pagerank'] = pagerank_score

            # Score of the best current match for this URL in 'extend'.
            # NOTE(review): the page itself is not indexed yet at this point,
            # so this scores previously indexed pages against the URL text —
            # confirm that this is the intended "TF-IDF" value.
            search_result = self.es_client.search(
                index="extend",
                query={"match": {"url": j['url']}},
                explain=True,  # explanation carries the score breakdown
            )

            hits = search_result['hits']['hits']
            if hits:
                tfidf_score = hits[0]['_explanation']['value']
            else:
                tfidf_score = 1  # Default value if no results are found

            # Combine both scores for final ranking; keep tfidf separately too.
            final_score = pagerank_score + tfidf_score
            j['final_score'] = final_score
            j['tfidf_score'] = tfidf_score

            # `document=` replaces the deprecated `body=` keyword of the 8.x client.
            self.es_client.index(index='extend', document=j)
            print(f"Indexed {j['url']} with PageRank: {pagerank_score}, TF-IDF: {tfidf_score}, Final Score: {final_score}")


In [22]:
from flask import Flask, request
import time
import pandas as pd
from elasticsearch import Elasticsearch

import os  # imported here so this cell also runs on its own in a fresh kernel

# Flask application for the JSON search API, with its Elasticsearch client
# attached as `app.es_client` for the route handlers to use.
app = Flask(__name__)
# NOTE(security): the literal password is only a fallback so existing runs keep
# working; set ES_PASSWORD (and ES_URL / ES_USER / ES_CA_CERTS) and remove it.
app.es_client = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    basic_auth=(os.environ.get("ES_USER", "elastic"),
                os.environ.get("ES_PASSWORD", "ySGo56ThrQ2moHl+WbG2")),
    ca_certs=os.environ.get("ES_CA_CERTS", "~/http_ca.crt"),
)

@app.route('/search_es_pr', methods=['GET'])
def search_es_pr():
    """Search the 'extend' index and return the hits as a JSON-able dict.

    The ``query`` request argument is matched against the ``text`` field with
    ``explain`` enabled. At most 100 hits are requested and ``url_lists`` is
    left out of the returned sources.

    Returns:
        dict with ``status``, ``total_hit``, ``results`` (one record per hit
        with ``title``, ``url``, ``text`` truncated to 100 characters,
        ``score`` and ``tfidf_score``) and ``elapse`` (seconds spent searching).
    """
    start = time.time()
    response_object = {'status': 'success'}

    # .get() with a default: a request without ?query= used to raise KeyError
    # (HTTP 500) on argList['query'][0]; it now searches for the empty string.
    query_term = request.args.get('query', '')

    # Elasticsearch query with explain to retrieve the score breakdown
    results = app.es_client.search(index='extend', source_excludes=['url_lists'], size=100,
                                   query={
                                       "match": {
                                           "text": query_term
                                       }
                                   },
                                   explain=True)

    end = time.time()

    total_hit = results['hits']['total']['value']

    # One plain dict per hit; no DataFrame round-trip is needed to get records.
    records = [
        {
            'title': hit["_source"]['title'],
            'url': hit["_source"]['url'],
            'text': hit["_source"]['text'][:100],  # first 100 characters of text
            'score': hit["_score"],
            'tfidf_score': hit["_explanation"]['value'],  # top-level value of the explanation
        }
        for hit in results['hits']['hits']
    ]

    # Prepare response data
    response_object['total_hit'] = total_hit
    response_object['results'] = records
    response_object['elapse'] = end - start

    return response_object


In [None]:
# Start Flask's built-in server with default settings; per the captured output
# below it serves on http://127.0.0.1:5000 until interrupted with CTRL+C.
app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [13/Feb/2025 23:57:05] "GET /search_es_pr?query=camt HTTP/1.1" 200 -
