# Crawler:

In [2]:
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

stop_words = stopwords.words("english")

In [5]:
class Crawler:
    def __init__(self):
        self.links_to_visit = []
        self.visited_links = []
        self.inv_index = dict()
        self.tf = dict()
        self.df = dict()
        self.doc_length = dict()

    def get_new_link(self):
        current_link = self.links_to_visit.pop()
        while (current_link in self.visited_links) and (len(self.links_to_visit) > 0):
            current_link = self.links_to_visit.pop()
        return current_link


    def extract_links(self, content):
        all_links = re.findall(r'<a href="(https://www.bbc.com/[^"]+)', content)
        self.links_to_visit.extend(all_links)


    def find_paragraphs(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        paragraphs = soup.find_all('p')
        headings1 = soup.find_all('h1')
        headings2 = soup.find_all('h2')
        headings3 = soup.find_all('h3')
        headings4 = soup.find_all('h4')
        headings5 = soup.find_all('h5')
        headings6 = soup.find_all('h6')
        text = ''

        for headings in [headings1, headings2, headings3, headings4, headings5, headings6]:
            for h in headings:
                text += h.get_text(separator='\n') + '\n'

        for p in paragraphs:
            text += p.get_text(separator='\n') + '\n'

        return text


    def tf_m(self, term, doc, ln):
        tf1 = dict()
        l = term + '.' + doc
        if l in self.tf.keys():
            tf1[l] = self.tf[l]*ln + 1
        else:
            tf1[l] = 1
        self.tf[l] = tf1[l]/ln


    def inverted_index(self, text, current_link):
        wn_lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(text)
        self.doc_length[current_link] = len(tokens)
        tok = set()
        for t in tokens:
            t = t.lower()
            if t not in stop_words and t.isalpha():
                t = wn_lemmatizer.lemmatize(t)
                tok.add(t)
                self.tf_m(t, current_link, len(tokens))             #TF
                if t in self.inv_index.keys():
                    l = self.inv_index[t]
                    if current_link not in l:
                        l.append(current_link)
                elif t not in self.inv_index.keys():
                    l = [current_link]
                self.inv_index.update({t:l})

        #df
        for t in tok:
            if t in self.df.keys():
                self.df[t] += 1
            else:
                self.df[t] = 1


    def crawl(self, start_url, max_links):
        self.links_to_visit = [start_url]
        while (len(self.links_to_visit) > 0) and (len(self.visited_links) < max_links):
            current_link = self.get_new_link()
            try:
                res = requests.get(current_link)
                if res.status_code == 200:
                    page_content = res.content
                    paragraph = self.find_paragraphs(page_content)
                    self.inverted_index(paragraph, current_link)
                    self.extract_links(str(page_content))
                    self.visited_links.append(current_link)
            except Exception as e:
                print(f"Error crawling {current_link}: {e}")

# Search:

In [6]:
def find_docs(term, inv_index):
    term = term.lower()
    wn_lemmatizer = WordNetLemmatizer()
    word = wn_lemmatizer.lemmatize(term)
    synset = wordnet.synsets(word)
    synonyms = [s.name().split('.')[0] for s in synset]
    synonyms = list(set(synonyms))
    docs_ids = []
    for s in synonyms:
        m = inv_index.get(s)
        if m:
            docs_ids.extend(m)
    return docs_ids

In [7]:
def search(term, inv_index):
    all_docs = []
    if find_docs(term, inv_index):
        all_docs.extend(find_docs(term, inv_index))
    return all_docs

# TF_IDF:

In [8]:
def tf_idf(term, doc, crawler):
    tf = crawler.tf.get(term + '.' + doc)
    df = crawler.df.get(term)
    if tf and df:
        idf = np.log10(len(crawler.inv_index.keys()) / (df + 1))
        return tf * idf
    else:
        return 0

In [9]:
def sort_tf_idf(term, res_docs, crawler):
    tf_idf_t = dict()
    for d in res_docs:
        tf_idf_t[d] = tf_idf(term, d, crawler)
    sorted_dict = dict(sorted(tf_idf_t.items(), key=lambda item: item[1]))
    l = list(sorted_dict.keys())
    return l

# Okapi bm25f:

In [26]:
def okapi_bm25f(term, doc, crawler, k1, b):
    tf = crawler.tf.get(term + '.' + doc)
    df = crawler.df.get(term)
    if tf and df:
        idf = np.log10(len(crawler.inv_index.keys()) / (df + 1))
        avg_doc_length = sum(crawler.doc_length.values())/len(crawler.doc_length.keys())
        return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (crawler.doc_length[doc] / avg_doc_length)))
    else:
        return 0

In [11]:
def search_okapi_bm25f(term, crawler, res_docs, k1=1.5, b=0.75):
    # Calculate Okapi BM25F scores for each document
    okapi_bm25f_scores = dict()
    for d in res_docs:
        okapi_bm25f_scores[d] = okapi_bm25f(term, d, crawler, k1, b)

    # Sort documents by Okapi BM25F scores in descending order
    sorted_docs = sorted(okapi_bm25f_scores.keys(), key=lambda x: okapi_bm25f_scores[x], reverse=True)
    return sorted_docs

# Test:

In [22]:
crawler = Crawler()
crawler.crawl('https://www.bbc.com/news', 40)

In [23]:
result = search('question', crawler.inv_index)
result

['https://www.bbc.com/news/live/world-africa-67745691?src_origin=BBCS_BBC',
 'https://www.bbc.com/news/live/uk-politics-67890460?src_origin=BBCS_BBC',
 'https://www.bbc.com/sport/live/football/67711240?src_origin=BBCS_BBC',
 'https://www.bbc.com/news',
 'https://www.bbc.com/culture',
 'https://www.bbc.com/reel',
 'https://www.bbc.com/news/live/world-africa-67745691?src_origin=BBCS_BBC',
 'https://www.bbc.com/video',
 'https://www.bbc.com/business',
 'https://www.bbc.com/news/live/uk-politics-67890460?src_origin=BBCS_BBC',
 'https://www.bbc.com/news/live/uk-politics-67910908?src_origin=BBCS_BBC',
 'https://www.bbc.com/innovation',
 'https://www.bbc.com/future',
 'https://www.bbc.com/travel',
 'https://www.bbc.com/reel',
 'https://www.bbc.com/future/earth',
 'https://www.bbc.com/video',
 'https://www.bbc.com/future-planet',
 'https://www.bbc.com/culture/article/20231222-moki-cherry-the-overlooked-swedish-artist-who-created-a-soulful-home',
 'https://www.bbc.com/future/article/20231229-th

In [24]:
res2 = sort_tf_idf('question', result, crawler)
res2

['https://www.bbc.com/news/live/world-africa-67745691?src_origin=BBCS_BBC',
 'https://www.bbc.com/sport/live/football/67711240?src_origin=BBCS_BBC',
 'https://www.bbc.com/culture',
 'https://www.bbc.com/reel',
 'https://www.bbc.com/video',
 'https://www.bbc.com/business',
 'https://www.bbc.com/news/live/uk-politics-67910908?src_origin=BBCS_BBC',
 'https://www.bbc.com/innovation',
 'https://www.bbc.com/future',
 'https://www.bbc.com/travel',
 'https://www.bbc.com/future/earth',
 'https://www.bbc.com/future-planet',
 'https://www.bbc.com/culture/article/20231222-moki-cherry-the-overlooked-swedish-artist-who-created-a-soulful-home',
 'https://www.bbc.com/news/live/uk-politics-67890460?src_origin=BBCS_BBC',
 'https://www.bbc.com/news',
 'https://www.bbc.com/future/article/20231229-the-problem-of-thinking-in-straight-lines',
 'https://www.bbc.com/news/world-africa-18930368']

In [27]:
res3 = search_okapi_bm25f('question', crawler, result)
res3

['https://www.bbc.com/news/world-africa-18930368',
 'https://www.bbc.com/future/article/20231229-the-problem-of-thinking-in-straight-lines',
 'https://www.bbc.com/news',
 'https://www.bbc.com/news/live/uk-politics-67890460?src_origin=BBCS_BBC',
 'https://www.bbc.com/news/live/world-africa-67745691?src_origin=BBCS_BBC',
 'https://www.bbc.com/sport/live/football/67711240?src_origin=BBCS_BBC',
 'https://www.bbc.com/culture',
 'https://www.bbc.com/reel',
 'https://www.bbc.com/video',
 'https://www.bbc.com/business',
 'https://www.bbc.com/news/live/uk-politics-67910908?src_origin=BBCS_BBC',
 'https://www.bbc.com/innovation',
 'https://www.bbc.com/future',
 'https://www.bbc.com/travel',
 'https://www.bbc.com/future/earth',
 'https://www.bbc.com/future-planet',
 'https://www.bbc.com/culture/article/20231222-moki-cherry-the-overlooked-swedish-artist-who-created-a-soulful-home']

# UI:

In [None]:
pip install PySimpleGUI

Collecting PySimpleGUI
  Downloading PySimpleGUI-4.60.5-py3-none-any.whl (512 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/512.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/512.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m512.0/512.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.7/512.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySimpleGUI
Successfully installed PySimpleGUI-4.60.5


In [None]:
import PySimpleGUI as sg

sg.theme('LightGrey1')

layout = [
    [sg.Text('Enter URL:'), sg.InputText(key='url'), sg.Button('Crawl')],
    [sg.Text('Visited Links:')],
    [sg.Multiline(size=(65, 10), key='output', background_color='white', autoscroll=True)],
    [sg.Text('Search Word:'), sg.InputText(key='search_word'), sg.Button('Search')],
    [sg.Radio('None', "RADIO1", key='radio1', default=True),
     sg.Radio('tf_idf', "RADIO1", key='radio2'),
     sg.Radio('okapi_bm25f', "RADIO1", key='radio3')],
    [sg.Text('Search Result:')],
    [sg.Multiline(size=(65, 10), key='search_result', background_color='white', autoscroll=True)],
    [sg.Button('Exit')]
]

window = sg.Window('Web Crawler', layout)


while True:
    event, values = window.read()
    if event in (sg.WIN_CLOSED, 'Exit'):
        break

    elif event == 'Crawl':
        url = values['url']
        if url.strip():
            try:
                crawler = Crawler()
                crawler.crawl(url, 40)
                window['output'].update('\n'.join(crawler.visited_links))
            except Exception as e:
                sg.popup_error(f"Error while crawling: {e}")
        else:
            sg.popup_error("Please enter a valid URL.")

    elif event == 'Search':
        search_word = values['search_word']
        if search_word.strip():
            try:
                result = search(search_word, crawler.inv_index)

                if values['radio1']:
                    window['search_result'].update('\n'.join(result))
                elif values['radio2']:
                    res2 = sort_tf_idf(search_word, result, crawler)
                    window['search_result'].update('\n'.join(res2))
                elif values['radio3']:
                    res3 = search_okapi_bm25f(search_word, crawler, result)
                    window['search_result'].update('\n'.join(res3))
            except Exception as e:
                sg.popup_error(f"Error while searching: {e}")
        else:
            sg.popup_error("Please enter a search word.")

window.close()