# Self study 2


In this self-study we build an index that supports Boolean search over the web pages that you crawl with the crawler from the 1st self study. You can continue to just extract the titles of the web-pages you crawl, or you can be more adventurous and look at the whole text that you get from the .get_text() method of a BeautifulSoup parser. In either case, the collection of texts from the crawled web-pages is your corpus. You should then:

- construct the vocabulary of terms for your corpus
- build an 'inverted' index for your vocabulary
- implement Boolean search for your index

In [None]:
# Some things already used in self study 1:
import requests
from bs4 import BeautifulSoup


A useful resource is the nltk natural language processing package:
https://www.nltk.org/
which provides methods for tokenization, stemming, and much more (the 'punkt' package is needed for tokenization):

In [None]:
import nltk
nltk.download('punkt')

Now let's use the title string of the AAU homepage as an example:

In [None]:
# Download the AAU front page and parse the returned HTML.
r=requests.get('https://www.aau.dk/')
r_parse = BeautifulSoup(r.text, 'html.parser')
# Use the string of the page's <title> element as the running example.
string=r_parse.find('title').string
print(string)

We can tokenize:

In [None]:
# Tokenize the example string with NLTK and print one token per line.
tokens=nltk.word_tokenize(string)
for t in tokens:
    print(t)

And we can stem:

In [None]:
# Print the Porter stem of every token.
# `ps` is also the stemmer the crawl loop further down applies to page tokens.
ps=nltk.PorterStemmer()
for t in tokens:
    print(ps.stem(t))



For the Danish language, the Porter stemmer will not be terribly useful! There is also a Danish option:

In [None]:
from nltk.stem.snowball import SnowballStemmer

dstemmer=SnowballStemmer("danish")

In [None]:
# Print the stem of every token again, this time from the Danish Snowball stemmer.
for t in tokens:
    print(dstemmer.stem(t))


What is most useful for you depends on which websites you crawl. It is not essential for the exercise that the stemming always is the best possible ...!

In [None]:
import random
import time
from datetime import datetime
from datetime import timedelta
from queue import SimpleQueue as Queue
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


# Urls that have been fetched, and one result record per indexed page
visited = []
results = []

# Initialize the frontier: 4 front queues, one of which is picked at random
# whenever a url is enqueued
frontier = [Queue() for _ in range(4)]

def enqueue(qlist, obj):
    """Put ``obj`` on one queue of ``qlist``, picked with ``random.choice``."""
    random.choice(qlist).put(obj)



# Keep back queues as dictionary, keyed by the host returned by extract_domain().
# The crawl loop creates each entry as
#   {'time': datetime before which the host is not fetched again,
#    'queue': Queue of urls waiting for that host}
backQ = {}

# Extract the next url crawl from back queues
def get_crawl(qd, delay=timedelta(seconds=2)):
    """Return the next url that may be fetched right now, or None.

    Args:
        qd: The back queues: a dict mapping a host to a dict with the keys
            'queue' (queue of urls for that host) and 'time' (datetime that
            has to be in the past before the host is used again).
        delay: Waiting period written into the host's 'time' entry after one
            of its urls has been handed out. Defaults to 2 seconds.

    Returns:
        The url taken from the first eligible queue, or None when every
        queue is empty or still inside its waiting period. The returned url
        is removed from its queue.
    """
    # Look at the hosts in order of their timestamp, oldest first.
    for key in sorted(qd, key=lambda k: qd[k]['time']):
        entry = qd[key]
        if entry['queue'].empty() or datetime.now() <= entry['time']:
            # Nothing queued for this host, or the host must still be left alone
            continue
        url = entry['queue'].get()
        # Update with new timestamp
        entry['time'] = datetime.now() + delay
        if url is not None:
            return url
        # A queued None cannot be told apart from "nothing found": keep looking
    return None

# Approximate host domain
def extract_domain(url):
    """Return the network-location part of ``url`` (host, plus port if given).

    The url is parsed with ``urllib.parse.urlparse`` instead of being split
    on "/", so a query string or fragment that directly follows the host is
    not mistaken for part of it.

    Returns:
        The netloc as a string. An empty string is returned when ``url`` has
        no ``//host`` part (for example a relative link) or cannot be parsed.
    """
    try:
        return urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced "[")
        return ""

In [None]:
# define start seed
seed = ["https://pinchofyum.com/", "https://loveandlemons.com/","https://imdb.com/", "https://gamegrumps.com/", "https://www.aau.dk/"]

# Put seeds in frontier
for s in seed:
    enqueue(frontier, s)

index = 0
# One parsed robots.txt per site, so it is downloaded once and not per page
robots = {}

while len(results) < 1000:
    # Ask the back queues exactly once per iteration: get_crawl() removes the
    # url it returns, so a second call would silently throw a page away.
    next_url = get_crawl(backQ)
    if next_url is None:
        # Nothing can be crawled right now: refill the back queues by
        # emptying the front queues
        moved = False
        for f in frontier:
            while not f.empty():
                url = f.get()
                domain = extract_domain(url)
                if domain not in backQ:
                    # Add new back queue if one for this domain does not exist
                    backQ[domain] = {'time': datetime.now(), 'queue': Queue()}
                backQ[domain]['queue'].put(url)
                moved = True
        if not moved:
            if all(entry['queue'].empty() for entry in backQ.values()):
                # No url is left anywhere, so the loop could never finish
                print('frontier is empty, stopping crawl')
                break
            # Urls are queued but their hosts are still in the waiting
            # period; pause briefly instead of spinning at full speed
            time.sleep(0.1)
        continue

    if next_url in visited:
        continue
    # Mark the url before fetching so that a page that fails or is disallowed
    # is not attempted again when another page links to it
    visited.append(next_url)

    try:
        print('crawling at ' + next_url)
        # robots.txt lives at the root of the site, not at the page url
        robots_url = urljoin(next_url, '/robots.txt')
        rp = robots.get(robots_url)
        if rp is None:
            rp = RobotFileParser()
            rp.set_url(robots_url)
            rp.read()
            robots[robots_url] = rp
        # Check if robots.txt allows
        if not rp.can_fetch("*", next_url):
            print('could not crawl at ' + next_url)
            continue

        # A timeout keeps one unresponsive server from blocking the crawl
        r = requests.get(next_url, timeout=10)
        r_parse = BeautifulSoup(r.text, 'html.parser')
        title = r_parse.find('title')
        if title is None:
            # Pages without a title are not indexed
            continue
        # get_text() always returns a str; .string is None when the title
        # element contains nested tags
        title = title.get_text()

        # Get text from page, tokenize and stem
        text = r_parse.get_text()
        tokens = nltk.word_tokenize(title)
        tokens.extend(nltk.word_tokenize(text))
        toks = [ps.stem(token) for token in tokens]

        # save result
        results.append({'url': next_url, 'title': title, 'tokens': toks, 'id': index})
        index += 1

        # Queue the absolute https links found on the page
        for a in r_parse.find_all('a'):
            if 'href' not in a.attrs:
                continue
            href = a['href']
            # Require the full scheme prefix so that relative links that
            # merely start with the letters "https" are not queued
            if href.startswith('https://') and href not in visited:
                enqueue(frontier, href)
    except Exception as exc:
        # Not a bare except: Ctrl-C (KeyboardInterrupt) must still stop the crawl
        print(f'woops at {next_url}: {exc}')

In [None]:
# Build inverted index of crawled pages: term -> {page id: occurrences on that page}
inv_index = {}
for page in results:
    # Count how often each term occurs on this page
    counts = {}
    for term in page['tokens']:
        counts[term] = counts.get(term, 0) + 1
    # Add this page's count to the postings of every term it contains
    for term, freq in counts.items():
        inv_index.setdefault(term, {})[page['id']] = freq


In [7]:
# Load the inverted index and the crawl results from their JSON files
import json
with open("inverted_index.json", "r") as index_file:
    inv_index = json.load(index_file)
with open("crawled.json", "r") as crawl_file:
    results = json.load(crawl_file)

In [16]:
import nltk
nltk.download('punkt')
ps=nltk.PorterStemmer()
# AND-merge list of ids
def andMerge(l, m):
    """Return the ids that occur in both ``l`` and ``m``.

    Both lists are walked with one cursor each. When the two current ids are
    equal the id is collected and both cursors advance; otherwise only the
    cursor at the smaller id advances. The walk ends as soon as either list
    is used up, so both lists have to be in ascending order for every common
    id to be found.
    """
    common = []
    a = 0
    b = 0
    while a < len(l) and b < len(m):
        diff = l[a] - m[b]
        if diff == 0:
            # Ids match
            common.append(l[a])
            a += 1
            b += 1
        elif diff < 0:
            a += 1
        elif diff > 0:
            b += 1
        else:
            # The ids cannot be ordered; stop
            break
    return common

def get_ids(dic):
    """Return the page ids of one inverted-index entry as sorted ints.

    Args:
        dic: The postings of a single term, keyed by page id. Ids may be ints
            or their string form (JSON object keys are always strings).

    Returns:
        The ids in ascending order, which is the order andMerge() needs. The
        key "*idf" is not a page id and is skipped; it no longer has to be
        present, so an index without that entry works as well.
    """
    return sorted(int(key) for key in dic if key != "*idf")

def search(searchstring):
    """Return the urls of the crawled pages that contain every query word.

    The query is split on whitespace, each word is stemmed with ``ps`` and
    looked up in ``inv_index``; the id lists of all words are intersected
    with andMerge().

    Args:
        searchstring: The query, words separated by whitespace.

    Returns:
        A list of urls taken from ``results``, in the order of the matching
        ids. The list is empty when the query has no words or when some word
        is not in the index.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so no empty "words" end up in the query
    terms = [ps.stem(word) for word in searchstring.split()]
    if not terms:
        return []

    ids = None
    for term in terms:
        postings = get_ids(inv_index[term]) if term in inv_index else []
        # The first term starts the result; every further term narrows it
        ids = postings if ids is None else andMerge(ids, postings)
        if not ids:
            # An empty intersection cannot become non-empty again
            return []

    # Translate ids to urls with a single pass over the results
    url_by_id = {e['id']: e['url'] for e in results}
    return [url_by_id[i] for i in ids if i in url_by_id]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bruger\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
search("love and lemons beef")

['https://www.loveandlemons.com/easy-dinner-ideas/',
 'https://www.loveandlemons.com/cabbage-soup/',
 'https://www.loveandlemons.com/nachos-recipe/']