In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import csv
import json
import flask_cors, flask
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [6]:
# Flask application object; the @app.route decorators further down register their views on it.
app = Flask(__name__)
# Wrap the app with flask_cors; no options are passed, so the extension's defaults apply.
CORS(app)

class Node:
    """One entry of a singly linked chain: a key, its value and a link to the next entry."""

    def __init__(self, key, value):
        # A new node starts detached; whoever owns the chain wires up ``next``.
        self.key, self.value = key, value
        self.next = None

class LinkedList:
    """Singly linked list of key/value nodes; new entries are pushed at the head."""

    def __init__(self):
        self.head = None

    def _walk(self):
        """Yield every node in chain order, starting at the head."""
        node = self.head
        while node:
            yield node
            node = node.next

    def insert(self, key, value):
        """Prepend a new ``Node(key, value)``; existing entries are left untouched."""
        fresh = Node(key, value)
        # Right-hand side is evaluated first, so ``fresh.next`` receives the old head.
        fresh.next, self.head = self.head, fresh

    def search(self, key):
        """Return the value of the first node whose key equals ``key``, else ``None``."""
        for node in self._walk():
            if node.key == key:
                return node.value
        return None

    def display(self):
        """Return all entries as a list of ``(key, value)`` tuples, head first."""
        return [(node.key, node.value) for node in self._walk()]

class HashTable:
    """Fixed-size hash table that resolves collisions by chaining ``LinkedList`` buckets."""

    def __init__(self, size):
        self.size = size
        # One slot per bucket; a slot stays ``None`` until its first insert.
        self.table = [None] * size

    def hash_function(self, key):
        """Map ``key`` to a slot number as ``hash(key) % self.size``."""
        return hash(key) % self.size

    def insert(self, key, value):
        """Store ``value`` under ``key`` in the bucket selected by ``hash_function``."""
        slot = self.hash_function(key)
        bucket = self.table[slot]
        if bucket is None:
            # Buckets are created lazily on first use.
            bucket = self.table[slot] = LinkedList()
        bucket.insert(key, value)

    def search(self, key):
        """Return the value stored for ``key``, or ``None`` when its bucket is empty or lacks it."""
        bucket = self.table[self.hash_function(key)]
        return None if bucket is None else bucket.search(key)

    def display(self):
        """Return every stored entry as ``(slot, key, value)`` tuples, in slot order."""
        return [
            (slot, key, value)
            for slot, bucket in enumerate(self.table)
            if bucket is not None
            for key, value in bucket.display()
        ]
    
def build_forward_index(data):
    """Build the forward index: article id -> title words followed by content words.

    Iterates over ``data["title"]``; an article whose id has no entry in
    ``data["content"]`` is skipped. Returns a ``HashTable`` with 100 slots.
    """
    forward_index = HashTable(size=100)

    for article_id, title_words in data["title"].items():
        if article_id not in data["content"]:
            continue
        # Title words come first, content words after.
        combined = title_words + data["content"][article_id]
        forward_index.insert(article_id, combined)

    return forward_index
 

def build_inverted_index(data):
    """Build the inverted index: word -> list of postings for articles containing it.

    Each posting is a dict with the keys ``article_id``, ``title`` and ``url``.
    One posting is appended per occurrence of a word in ``data["content"]``, so an
    article can appear several times under the same word. Returns a ``HashTable``
    with 100 slots.
    """
    inverted_index = HashTable(size=100)

    for article_id, words in data["content"].items():
        title = data["title"][article_id]
        link = data["url"][article_id]

        for word in words:
            # A fresh dict per occurrence, so postings never alias each other.
            posting = {"article_id": article_id, "title": title, "url": link}
            existing = inverted_index.search(word)
            if existing:
                existing.append(posting)
            else:
                inverted_index.insert(word, [posting])

    return inverted_index

def single_word_search(inverted_index, word):
    """Look up ``word`` in ``inverted_index`` and return whatever its ``search`` method yields."""
    postings = inverted_index.search(word)
    return postings

def multi_word_search_2(inverted_index, query):
    """Return one posting per article that contains *every* word of ``query``.

    Postings are the dicts stored by the inverted index (they carry an
    ``article_id`` key). Dicts are not hashable, so the intersection is done on
    ``article_id`` rather than on the postings themselves.

    Args:
        inverted_index: object whose ``search(word)`` returns a list of posting
            dicts, or ``None`` when the word is unknown.
        query: whitespace-separated search words; ``None`` or ``""`` gives ``[]``.

    Returns:
        A list of posting dicts, one per matching article, in the order the
        articles first appear under the first query word. Empty if any query
        word is unknown.
    """
    words = query.split() if query else []
    if not words:
        return []

    first = inverted_index.search(words[0])
    if not first:
        return []

    # Keep the first posting seen for each article id (dicts preserve insertion order).
    candidates = {}
    for posting in first:
        candidates.setdefault(posting["article_id"], posting)

    for word in words[1:]:
        postings = inverted_index.search(word)
        if not postings:
            # Strict AND: an unknown word means no article can match.
            return []
        ids_for_word = {posting["article_id"] for posting in postings}
        candidates = {
            article_id: posting
            for article_id, posting in candidates.items()
            if article_id in ids_for_word
        }
        if not candidates:
            return []

    return list(candidates.values())

def multi_word_search(inverted_index, query):
    """Return the postings of the first query word, narrowed by the remaining words.

    The postings of the first word are kept only if an equal posting also exists
    for each later word. A later word with no postings at all is ignored
    rather than emptying the result (best-effort matching).

    Args:
        inverted_index: object whose ``search(word)`` returns a list of postings,
            or ``None`` when the word is unknown.
        query: whitespace-separated search words; ``None`` or ``""`` gives ``[]``.

    Returns:
        A new list of postings; the index's own lists are never handed out.
    """
    words = query.split() if query else []
    if not words:
        return []

    # Single lookup; copy so callers cannot mutate the index's own posting list.
    result = list(inverted_index.search(words[0]) or [])

    for word in words[1:]:
        current_result = inverted_index.search(word)
        if current_result:
            result = [posting for posting in result if posting in current_result]

    return result


from collections import Counter

def rank_results(input_list):
    """Order postings by how often their article occurs and keep one per article.

    Args:
        input_list: iterable of posting dicts, each with an ``article_id`` key.
            ``None`` or an empty list gives ``[]``.

    Returns:
        A list with the first posting of every distinct ``article_id``, most
        frequent article first. Articles with equal counts keep their input order
        (``sorted`` is stable, also with ``reverse=True``).
    """
    if not input_list:
        return []

    # Count on the id itself; it only needs to be hashable.
    counts = Counter(item['article_id'] for item in input_list)
    ordered = sorted(input_list, key=lambda item: counts[item['article_id']], reverse=True)

    # Check and record the same key (article_id), so duplicates are actually dropped.
    seen_ids = set()
    output = []
    for item in ordered:
        article_id = item['article_id']
        if article_id not in seen_ids:
            seen_ids.add(article_id)
            output.append(item)

    return output

def display_results(results, dataset=None):
    """Print the stored content of every article id in ``results``.

    Args:
        results: iterable of article ids (keys of ``dataset["content"]``).
        dataset: mapping with a ``"content"`` sub-mapping keyed by article id.
            When omitted, the module-level name ``data`` is used, as before.
            NOTE(review): no ``data`` global is defined in the visible cells, so
            callers should pass ``dataset`` explicitly.
    """
    for article_id in results:
        # Resolve lazily so an empty ``results`` never touches the global fallback.
        articles = dataset if dataset is not None else data
        print(f"Article {article_id}: {articles['content'][article_id]}")

def add_content(data, new_article):
    """Append one article to the column-oriented ``data`` mapping, in place.

    Args:
        data: dict with the sub-mappings ``"index"``, ``"source"``, ``"title"``
            and ``"content"`` (and optionally ``"url"``), each keyed by article id.
        new_article: sequence ``(source, title, content)`` or
            ``(source, title, content, url)``.

    Returns:
        The same ``data`` object, now containing the new article.
    """
    # Start from the article count, but step past ids already taken so a
    # sparse id set can never cause an existing article to be overwritten.
    next_id = len(data["index"])
    while str(next_id) in data["index"]:
        next_id += 1
    article_id = str(next_id)

    data["index"][article_id] = next_id
    data["source"][article_id] = new_article[0]
    data["title"][article_id] = new_article[1]
    data["content"][article_id] = new_article[2]

    # build_inverted_index reads data["url"][article_id] for every article in
    # "content", so keep the url column in step with the others.
    if len(new_article) > 3:
        data.setdefault("url", {})[article_id] = new_article[3]
    elif "url" in data:
        data["url"][article_id] = ""

    return data

def remove_duplicates(input_list, key = None):
    """Return the items of ``input_list`` with later duplicates dropped, order kept.

    Args:
        input_list: iterable of dicts. ``None`` or an empty list gives ``[]``
            (``HashTable.search`` returns ``None`` for unknown words).
        key: when given, two items are duplicates if ``item.get(key)`` is equal;
            when ``None``, two items are duplicates if the whole dicts are equal.

    Returns:
        A new list holding the first occurrence of each distinct item.
    """
    if not input_list:
        return []

    seen = set()          # fingerprints that can be hashed
    seen_unhashable = []  # fallback for fingerprints that cannot (e.g. list values)
    result = []

    for item in input_list:
        marker = item if key is None else item.get(key)
        try:
            token = frozenset(marker.items()) if key is None else marker
            is_new = token not in seen
            if is_new:
                seen.add(token)
        except TypeError:
            # Unhashable content: compare by equality instead of by hash.
            is_new = marker not in seen_unhashable
            if is_new:
                seen_unhashable.append(marker)

        if is_new:
            result.append(item)

    return result

In [7]:
def load_data_from_json(file_path):
    """Read the JSON document at ``file_path`` and return the decoded object.

    The file is opened as UTF-8 explicitly, so the result does not depend on the
    platform's default text encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Build the path with os.path.join: the literal "Files\cleaned.json" relied on the
# unrecognised escape "\c" and only resolved on Windows.
json_file_path = os.path.join("Files", "cleaned.json")
json_data = load_data_from_json(json_file_path)

# Both indexes are built once, up front; the route handlers below read them.
forward_index = build_forward_index(json_data)
inverted_index = build_inverted_index(json_data)

In [8]:
@app.route("/search_1", methods=["GET"], endpoint='single_word_search')
def single_word_search():
    """GET /search_1?word=<w> — look up one word in the inverted index.

    Responds with JSON holding three parallel lists, ``article_ids``, ``titles``
    and ``urls``, one entry per matching article. An unknown or missing word
    yields three empty lists.

    Note: this definition rebinds the module-level name ``single_word_search``,
    which an earlier cell uses for a two-argument helper.
    """
    word = request.args.get('word')

    # search() returns None for unknown words; normalise so the request does not fail.
    postings = inverted_index.search(word) or []

    # Rank on the raw postings so per-article occurrence counts still exist,
    # then collapse to one posting per article.
    ranked_results = remove_duplicates(rank_results(postings), key="article_id")

    return jsonify(
        article_ids=[hit['article_id'] for hit in ranked_results],
        titles=[hit['title'] for hit in ranked_results],
        urls=[hit['url'] for hit in ranked_results],
    )

@app.route("/search_2", methods=["GET"], endpoint='multi_word_search')
def multi_word_search():
    """GET /search_2?word=<w1 w2 ...> — multi-word lookup in the inverted index.

    The postings of the first word are narrowed by each later word that has
    postings of its own; later words unknown to the index are ignored. Responds
    with JSON holding three parallel lists, ``article_ids``, ``titles`` and
    ``urls``. A missing or blank query yields three empty lists.

    Note: this definition rebinds the module-level name ``multi_word_search``,
    which an earlier cell uses for a two-argument helper.
    """
    # A missing "word" parameter arrives as None; treat it like an empty query.
    query = request.args.get('word') or ""
    words = query.split()

    result = []
    if words:
        result = inverted_index.search(words[0]) or []
        for word in words[1:]:
            current_result = inverted_index.search(word)
            if current_result:
                result = [d for d in result if d in current_result]

    # Rank on the raw postings so per-article occurrence counts still exist,
    # then collapse to one posting per article.
    ranked_results = remove_duplicates(rank_results(result), key="article_id")

    return jsonify(
        article_ids=[hit['article_id'] for hit in ranked_results],
        titles=[hit['title'] for hit in ranked_results],
        urls=[hit['url'] for hit in ranked_results],
    )

@app.route("/add", methods=["GET"])
def add_content(data=None, new_article=None):
    """GET /add?source=..&title=..&content=..&url=.. — append an article to the dataset.

    Flask calls a view with URL variables only, and this rule has none, so both
    parameters are optional: called as a view they are filled from ``json_data``
    and the query string; called directly with ``(data, new_article)`` the
    function behaves as before.

    Args:
        data: column-oriented dataset dict; defaults to the module-level ``json_data``.
        new_article: sequence ``(source, title, content[, url])``; defaults to the
            query parameters ``source``, ``title``, ``content`` and ``url``.
            ``title`` and ``content`` are split on whitespace because the index
            builders treat them as word lists — NOTE(review): confirm the
            parameter names and tokenisation with the client.

    Returns:
        The updated ``data`` mapping, or a 400 JSON error when a request
        supplies neither title nor content.

    Note: this definition rebinds the module-level name ``add_content`` from an
    earlier cell. The forward and inverted indexes are not rebuilt here, so the
    new article is not searchable until they are.
    """
    if data is None:
        data = json_data

    if new_article is None:
        params = request.args
        title_words = params.get("title", "").split()
        content_words = params.get("content", "").split()
        if not title_words and not content_words:
            return jsonify(error="title or content is required"), 400
        new_article = (params.get("source", ""), title_words, content_words, params.get("url", ""))

    # Step past ids already in use so an existing article is never overwritten.
    next_id = len(data["index"])
    while str(next_id) in data["index"]:
        next_id += 1
    article_id = str(next_id)

    data["index"][article_id] = next_id
    data["source"][article_id] = new_article[0]
    data["title"][article_id] = new_article[1]
    data["content"][article_id] = new_article[2]

    # build_inverted_index reads data["url"][article_id], so keep that column in step.
    if len(new_article) > 3:
        data.setdefault("url", {})[article_id] = new_article[3]
    elif "url" in data:
        data["url"][article_id] = ""

    return data

# Start Flask's built-in server only when this code runs as the main module.
# app.run() blocks until the server is stopped, so in a notebook this cell keeps running.
if __name__ == "__main__":
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [06/Dec/2023 20:11:57] "GET /search_1?word=ronaldo HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:11:59] "GET /search_1?word=ronaldo HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:00] "GET /search_1?word=ronaldo HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:28] "GET /search_2?word=ronaldo%20is HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:35] "GET /search_1?word=authority HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:42] "GET /search_1?word=committee HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:49] "GET /search_2?word=committeenv%20jhbj HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:12:55] "GET /search_2?word=I%20am%20sick HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:13:06] "GET /search_2?word=How%20to%20race%20a%20car HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:13:08] "GET /search_2?word=How%20to%20race%20a%20car HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2023 20:13:15] "GET /search_2?word=racing%20cars 