# Self Study 3

This self study concludes our first "miniproject" on crawling and search. The tasks for this self study are:
- modify/extend the inverted index you constructed in the previous self study to contain for all postings the term frequencies (if your documents are just the titles of the web pages, you will see very few term frequencies larger than 1, but do not worry about that).
- calculate the idf values for all terms, and also include them in your index (cf. slide 3.20 for a schematic view)
- implement ranked retrieval as described on slides 3.19 and 3.20 for the ntc.bnc similarity metric 

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
import random
import nltk
nltk.download('punkt')
from queue import SimpleQueue as Queue
from datetime import datetime
from datetime import timedelta
import json
import math
import numpy as np
ps=nltk.PorterStemmer()

def enqueue(qlist, obj):
    i = random.randint(0, len(qlist) -1)
    qlist[i].put(obj)

def extract_domain(url):
    s = url.split("/")
    return s[2]

def get_crawl(qd):
    result = None
    keys = list(qd)
    i = 0
    sec2 = timedelta(0,2)
    while result is None and i < len(keys):
        key = keys[i]
        if not qd[key]['queue'].empty() and datetime.now() > qd[key]['time']:
            result = qd[key]['queue'].get()
            qd[key]['time'] = datetime.now() + sec2

        else:
            i += 1
    return result


def idf(term, idx):
    if term in idx.keys():
        return math.log10( len(idx) / len(idx[term]))
    else:
        return 0
    



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bruger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [None]:
seed = ["https://pinchofyum.com/", "https://loveandlemons.com/","https://imdb.com/", "https://gamegrumps.com/"]
frontier = []
for i in range(10):
    frontier.append(Queue())
for s in seed:
    enqueue(frontier, s)

visited = []
results = []
index = 0
backQ = {}
while len(results) < 1000:
    s = random.random()
    if s > 0.5:
        print(f'index size is now {index}')
    rp=RobotFileParser()
    next_url = get_crawl(backQ)
    if not backQ or get_crawl(backQ) is None:
        for i in range(len(frontier)-1):
            f = frontier[i]
            while not f.empty():
                url = f.get()
                domain = extract_domain(url)
                if domain not in backQ.keys():
                    backQ[domain] = {'time': datetime.now(), 'queue': Queue()}
                backQ[domain]['queue'].put(url)

    else:
        try:
            if next_url in visited or next_url is None:
                continue
            print('crawling at ' + next_url)
            rp.set_url(next_url)
            rp.read()

            if rp.can_fetch("*", next_url):
                r=requests.get(next_url)
                visited.append(next_url)
                r_parse = BeautifulSoup(r.text, 'html.parser')
                title = r_parse.find('title')
                if title is not None:
                    title = title.string
                    text = r_parse.get_text()
                    tokens = nltk.word_tokenize(title)
                    tokens.extend(nltk.word_tokenize(text))
                    toks = []
                    for token in tokens:
                        toks.append(ps.stem(token))
                    slup = {'url': next_url, 'title': title, 'tokens': toks, 'id': index}
                    print(f'adding {next_url} to index')
                    index += 1
                    results.append(slup)
                    for a in r_parse.find_all('a'):
                        if 'href' in a.attrs:
                            l = a['href']
                            if l.startswith('https') and l not in visited:
                                enqueue(frontier, l)
            else:
                print('could not crawl at ' + next_url)
        except:
            print(f'woops at {next_url}')


In [10]:
len(results)



1000

In [17]:
inv_index = {}
for e in results:
    d = {}
    for t in e['tokens']:
        if t in d.keys():
            d[t] += 1
        else:
            d[t] = 1
    for k in d:
        if k in inv_index.keys():
            inv_index[k][e['id']] = d[k]
        else:
            inv_index[k] = {e['id']: d[k]}
            

In [18]:
print(len(inv_index))

50284


In [19]:
len(results)

1000

In [21]:
with open("inverted_index.json", "w") as outfile:
    json.dump(inv_index, outfile)

In [None]:
with open("crawled.json", "w") as outfile:
    json.dump(results, outfile)

In [4]:
with open("crawled.json", "r") as infile:
          crawled = json.load(infile)
          
with open("inverted_index.json", "r") as infile:
          inv_index = json.load(infile)


In [23]:
def search(query):
    words = query.split(" ")
    stemmed = []
    for w in words:
        stemmed.append(ps.stem(w))
    words = np.array(stemmed)
    
    docs = {}
    i = 0
    for w in words:
        if w in inv_index.keys():
            d = inv_index[w]
            for k in d.keys():
                if k != '*idf':
                    if k not in docs.keys():
                        docs[k] = np.zeros(len(words))
                    docs[k][i] = d[k] * d['*idf']
        i += 1
    for k in docs.keys():
        v = docs[k]
        vlen = np.sqrt(np.sum(np.apply_along_axis(lambda x: x**2,0, v)))
        docs[k] = v / vlen
    results = []
    words = np.ones(len(words))
    wlen = np.sqrt(np.sum(np.apply_along_axis(lambda x: x**2,0, words)))
    words = words /wlen
    for k in docs.keys():
        results.append((k, np.dot(docs[k], words)))
    results.sort(reverse=True, key=lambda x: x[1])
    results = results[:10]
    print(results)
    rep = []
    for i in range(len(results)):
        t = int(results[i][0])
        rep.append(crawled[t]['url'])
    return rep

In [31]:
search("yeast")

[('18', 1.0), ('27', 1.0), ('235', 1.0), ('237', 1.0), ('352', 1.0), ('486', 1.0), ('560', 1.0), ('563', 1.0), ('589', 1.0), ('651', 1.0)]


['https://www.loveandlemons.com/summer-appetizers/',
 'https://www.loveandlemons.com/love-lemons-cooking-club-september/',
 'https://www.loveandlemons.com/vegan-recipes/',
 'https://www.loveandlemons.com/appetizers/',
 'https://www.loveandlemons.com/cashew-cream/',
 'https://www.loveandlemons.com/vegan-mac-and-cheese/',
 'https://www.loveandlemons.com/dinner-rolls-recipe/',
 'https://www.loveandlemons.com/no-knead-bread-recipe/',
 'https://www.loveandlemons.com/vegan-cheese/',
 'https://www.loveandlemons.com/soft-pretzels-recipe/']

In [22]:
vec = np.array([1,2,3])
vlen = np.sqrt(np.sum(np.apply_along_axis(lambda x: x**2,0, vec)))
ne = vec / vlen
np.sqrt(np.sum(np.apply_along_axis(lambda x: x**2,0, ne)))

1.0

In [25]:
b = {'a': 100, 'b': 2, 'c': 3}
max(b, key=lambda key: b[key])


'a'

In [33]:
for c in crawled:
    print(c['url'])

https://loveandlemons.com/
https://imdb.com/
https://www.loveandlemons.com/jalapeno-cornbread/
https://instagram.com/imdb
https://www.loveandlemons.com/honey-butter/
https://www.instagram.com/loveandlemons/
https://www.loveandlemons.com/baked-potato/
https://pro.imdb.com/login/ap?u=/login/lwa&imdbPageAction=signUp&rf=cons_nb_hm&ref_=cons_nb_hm
https://www.facebook.com/lovelemonsfood
https://twitch.tv/IMDb
https://www.loveandlemons.com/healthy-wraps/
https://contribute.imdb.com/czone?ref_=nv_cm_cz
https://m.imdb.com/chart/starmeter/?ref_=nv_cel_brn
https://www.loveandlemons.com/granola/
https://help.imdb.com/imdb
https://instagram.com/loveandlemons/
https://www.pinterest.com/loveandlemons/
https://www.boxofficemojo.com
https://www.loveandlemons.com/summer-appetizers/
https://pro.imdb.com/login/imdb?u=https%3A%2F%2Fpro.imdb.com%2Flogin%2Flwa%3Frf%3Dcons_nb_hm%26ref_%3Dcons_nb_hm&rf=cons_nb_hm
https://www.loveandlemons.com/falafel/
https://www.pinterest.com/pin/create/bookmarklet/?url=htt