# Self Study 3

This self study concludes our first "miniproject" on crawling and search. The tasks for this self study are:
- modify/extend the inverted index you constructed in the previous self study so that every posting also stores its term frequency (if your documents are just the titles of the web pages, you will see very few term frequencies larger than 1, but do not worry about that).
- calculate the idf values for all terms, and also include them in your index (cf. slide 3.20 for a schematic view)
- implement ranked retrieval as described on slides 3.19 and 3.20 for the ntc.bnc similarity metric 

In [1]:
from queue import SimpleQueue as Queue
import random
from datetime import datetime
from datetime import timedelta
from urllib.robotparser import RobotFileParser
import requests
from bs4 import BeautifulSoup
import math
import nltk
# Download the 'punkt' tokenizer data (presumably what nltk.word_tokenize
# needs -- confirm); the call only reports "up-to-date" when already installed.
nltk.download('punkt')
# Single Porter stemmer instance, used both when indexing crawled pages and
# when preprocessing search queries so that terms match.
ps=nltk.PorterStemmer()


# Crawler state shared by the cells below.
# visited: URLs that have already been handled; results: one dict per indexed page.
visited = []
results = []

# Number of front queues in the frontier. URLs are spread over them at random
# by enqueue(); the previous comment claimed 10 while the code created 4.
NUM_FRONT_QUEUES = 4

# Initialize the frontier as a list of empty front queues
frontier = [Queue() for _ in range(NUM_FRONT_QUEUES)]

def enqueue(qlist, obj):
    """Put *obj* on one queue of *qlist*, chosen uniformly at random."""
    random.choice(qlist).put(obj)



# Back queues, one per host: {domain: {'time': earliest next fetch, 'queue': Queue}}
backQ = {}

# Extract the next url to crawl from the back queues
def get_crawl(qd):
    """Pop the next URL that may be fetched, or return None if there is none.

    Back queues are visited in ascending order of their 'time' stamp. The
    first queue that is non-empty and whose stamp lies in the past yields its
    next URL, and its stamp is pushed two seconds into the future so the same
    host is not hit again immediately.
    """
    politeness_gap = timedelta(seconds=2)
    for domain in sorted(qd, key=lambda d: qd[d]['time']):
        entry = qd[domain]
        # Skip hosts with nothing queued or still inside their waiting period
        if entry['queue'].empty() or datetime.now() <= entry['time']:
            continue
        url = entry['queue'].get()
        entry['time'] = datetime.now() + politeness_gap
        if url is not None:
            return url
    return None

# Approximate host domain
def extract_domain(url):
    """Return the network location (host, plus port/userinfo if given) of *url*.

    Uses the standard URL parser instead of splitting on "/", so a URL
    without a path but with a query or fragment ("https://a.com?x=1") yields
    just the host, and a link without "//" (e.g. "https:foo") yields an empty
    string instead of raising IndexError.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    return urlparse(url).netloc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bruger\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from urllib.parse import urljoin

# define start seed
seed = [
    "https://pinchofyum.com/",
    "https://loveandlemons.com/",
    "https://imdb.com/",
    "https://gamegrumps.com/",
    "https://www.aau.dk/",
]

# Put seeds in frontier
for s in seed:
    enqueue(frontier, s)

# Next document id to hand out
index = 0

# One parsed robots.txt per host, so it is fetched once instead of per page
robots = {}

while len(results) < 1000:
    # Ask the back queues exactly once per iteration: every call to
    # get_crawl() dequeues a URL, so calling it again just to test for None
    # would silently throw that URL away.
    next_url = get_crawl(backQ)
    if next_url is None:
        # No back queue can serve a URL right now (all empty, or all still in
        # their waiting period): move everything from the front queues into
        # the per-domain back queues.
        for f in frontier:
            while not f.empty():
                url = f.get()
                domain = extract_domain(url)
                if domain not in backQ:
                    # Add new back queue if one for this domain does not exist
                    backQ[domain] = {'time': datetime.now(), 'queue': Queue()}
                backQ[domain]['queue'].put(url)
        if all(b['queue'].empty() for b in backQ.values()):
            # Nothing is queued anywhere, so the loop could never make progress
            print('frontier exhausted')
            break
        continue

    if next_url in visited:
        continue
    # Mark the URL as handled up front so a failing or disallowed page is not
    # retried every time another page links to it.
    visited.append(next_url)

    try:
        # Logic for crawling a page
        print('crawling at ' + next_url)
        # The robot parser must be pointed at the host's robots.txt, not at
        # the page that is about to be fetched.
        domain = extract_domain(next_url)
        rp = robots.get(domain)
        if rp is None:
            rp = RobotFileParser()
            rp.set_url(urljoin(next_url, '/robots.txt'))
            rp.read()
            robots[domain] = rp
        # Check if robots.txt allows
        if not rp.can_fetch("*", next_url):
            print('could not crawl at ' + next_url)
            continue

        # A timeout keeps one unresponsive server from stalling the crawl
        r = requests.get(next_url, timeout=10)
        # extract title
        r_parse = BeautifulSoup(r.text, 'html.parser')
        title_tag = r_parse.find('title')
        if title_tag is None:
            # Pages without a <title> are neither indexed nor expanded
            continue
        # get_text() always returns a plain string, also for an empty title
        # or one with nested markup
        title = title_tag.get_text()
        # Get text from page and tokenize
        tokens = nltk.word_tokenize(title)
        tokens.extend(nltk.word_tokenize(r_parse.get_text()))
        toks = [ps.stem(token) for token in tokens]
        # save result
        results.append({'url': next_url, 'title': title, 'tokens': toks, 'id': index})
        index += 1
        # Queue every absolute https link that has not been handled yet
        for a in r_parse.find_all('a'):
            if 'href' in a.attrs:
                link = a['href']
                if link.startswith('https') and link not in visited:
                    enqueue(frontier, link)
    except Exception as exc:
        # Best effort: report the failure and go on with the next URL.
        # Exception (not a bare except) keeps Ctrl-C / KeyboardInterrupt working.
        print(f'woops at {next_url}: {exc!r}')

In [None]:
# Calculate idf value for a term and index
def idf(term, idx, n_docs=None):
    """Return the inverse document frequency ``log10(N / df)`` of *term*.

    Args:
        term: The term to look up.
        idx: Inverted index mapping each term to a postings dict
            ``{doc_id: term_frequency}``. A ``'*idf'`` entry that is already
            stored in a postings dict is not counted as a document.
        n_docs: Number of documents N in the collection. When omitted it is
            derived by counting the distinct document ids in *idx*, which
            scans the whole index -- pass it explicitly when calling in a loop.

    Returns:
        ``log10(n_docs / df)`` where df is the number of documents containing
        *term*, or 0 if the term has no postings.
    """
    postings = idx.get(term)
    if not postings:
        return 0
    # Document frequency = number of postings, ignoring a stored idf entry
    df = len(postings) - ('*idf' in postings)
    if df == 0:
        return 0
    if n_docs is None:
        n_docs = len({doc_id
                      for plist in idx.values()
                      for doc_id in plist
                      if doc_id != '*idf'})
    # N is the number of documents, not the number of distinct terms
    # (len(idx)), which the earlier version used.
    return math.log10(n_docs / df)

# Build index: term -> {doc id: term frequency}
inv_index = {}
for e in results:
    # Term frequencies within this document
    tf = {}
    for t in e['tokens']:
        tf[t] = tf.get(t, 0) + 1
    for term, count in tf.items():
        inv_index.setdefault(term, {})[e['id']] = count

# Insert idf values in index, stored under the reserved key '*idf' next to
# the postings of each term
n_docs = len(results)
for term, postings in inv_index.items():
    postings['*idf'] = idf(term, inv_index, n_docs)

In [2]:
import json


def _load_json(path):
    """Read and parse the JSON document stored at *path*."""
    with open(path, "r") as infile:
        return json.load(infile)


# Crawl results and inverted index saved by an earlier run
crawled = _load_json("crawled.json")
inv_index = _load_json("inverted_index.json")

In [3]:
import numpy as np
# Ranked search on text match
def search(query):
    """Return the URLs of the (at most) ten best-matching documents.

    The query is split on whitespace and stemmed with the module-level
    stemmer ``ps``; repeated query words count once (boolean query weights).
    For every document that contains at least one query term, a vector of
    ``tf * idf`` weights over the query terms is built from ``inv_index``,
    scaled to unit length and compared with the unit-length all-ones query
    vector by dot product. The top ten ``(doc_id, score)`` pairs are printed
    and their ids translated to URLs via ``crawled``.

    NOTE(review): document vectors are normalised over the query-term
    dimensions only, not over every term of the document; full cosine
    normalisation of the documents would need per-document lengths computed
    from the whole index -- confirm which one the assignment intends.

    Args:
        query: Free-text query string.

    Returns:
        List of URLs, best match first; empty if no query term is indexed.
    """
    # Preprocess query: split() without argument ignores repeated or
    # surrounding whitespace; dict.fromkeys drops duplicates but keeps order
    terms = list(dict.fromkeys(ps.stem(w) for w in query.split()))

    # Build vector for each document with matching words
    doc_vectors = {}
    for pos, term in enumerate(terms):
        postings = inv_index.get(term)
        if postings is None:
            continue
        term_idf = postings['*idf']
        for doc_id, tf in postings.items():
            if doc_id == '*idf':
                continue
            vec = doc_vectors.get(doc_id)
            if vec is None:
                vec = doc_vectors[doc_id] = np.zeros(len(terms))
            vec[pos] = tf * term_idf

    # Construct unit-length query vector (every query term has weight 1)
    query_vec = np.ones(len(terms))
    query_len = np.sqrt(np.dot(query_vec, query_vec))
    if query_len > 0:
        query_vec = query_vec / query_len

    # Calculate cosine similarity for each document vector
    scored = []
    for doc_id, vec in doc_vectors.items():
        vec_len = np.sqrt(np.dot(vec, vec))
        # All matched terms may have idf 0 (term occurs in every document);
        # score such documents 0 instead of dividing by zero and getting NaN
        if vec_len > 0:
            score = float(np.dot(vec / vec_len, query_vec))
        else:
            score = 0.0
        scored.append((doc_id, score))

    # Sort and select top 10
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:10]
    print(top)

    # Translate ids to urls (ids are strings when the index was loaded from JSON)
    return [crawled[int(doc_id)]['url'] for doc_id, _ in top]

In [15]:
search("pasta tomato meat")

[('870', 0.9956356178015583), ('0', 0.9023519251550977), ('43', 0.8373621693312493), ('37', 0.8164050363872593), ('56', 0.8164050363872593), ('136', 0.8164050363872593), ('232', 0.8164050363872593), ('239', 0.8164050363872593), ('706', 0.8164050363872593), ('971', 0.8164050363872593)]


['https://www.loveandlemons.com/chickpea-salad-sandwich/',
 'https://loveandlemons.com/',
 'https://www.loveandlemons.com/easy-dinner-ideas/',
 'https://www.loveandlemons.com/butternut-squash-soup/',
 'https://www.loveandlemons.com/how-to-cook-brown-rice/',
 'https://www.cedarsfoods.com/',
 'https://www.loveandlemons.com/healthy-lunch-ideas/',
 'https://www.loveandlemons.com/farro/',
 'https://www.loveandlemons.com/focaccia/',
 'https://www.loveandlemons.com/stuffed-zucchini-boats/']