# Chapter xx

*Data Structures and Information Retrieval in Python*

Copyright 2021 Allen Downey

License: [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/)

In [83]:
from os.path import basename, exists

def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve
        local, _ = urlretrieve(url, filename)
        print('Downloaded ' + local)
    return filename
    
# download('https://github.com/AllenDowney/DSIRP/raw/main/utils.py')

[Click here to run this chapter on Colab](https://colab.research.google.com/github/AllenDowney/DSIRP/blob/main/chapters/chap01.ipynb)

In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [24]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc)
type(soup)

bs4.BeautifulSoup

In [25]:
def iterative_DFS(root):
    stack = []
    stack.append(root)
    
    while(stack):
        tag = stack.pop()
        yield tag

        children = getattr(tag, "contents", [])
        for child in reversed(children):
            stack.append(child)

In [26]:
from bs4 import NavigableString

for element in iterative_DFS(soup):
    if isinstance(element, NavigableString):
        print(element.string, end='')

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [58]:
from string import whitespace, punctuation

def iterate_words(soup):
    for element in iterative_DFS(soup):
        if isinstance(element, NavigableString):
            for word in element.string.split():
                word = word.strip(whitespace + punctuation)
                if word:
                    yield word

In [59]:
for word in iterate_words(soup):
    print(word)

The
Dormouse's
story
The
Dormouse's
story
Once
upon
a
time
there
were
three
little
sisters
and
their
names
were
Elsie
Lacie
and
Tillie
and
they
lived
at
the
bottom
of
a
well


In [54]:
from bs4 import NavigableString

text = soup.get_text()
for word in text.split():
    print(word.strip(whitespace + punctuation))

The
Dormouse's
story
The
Dormouse's
story
Once
upon
a
time
there
were
three
little
sisters
and
their
names
were
Elsie
Lacie
and
Tillie
and
they
lived
at
the
bottom
of
a
well



In [60]:
from collections import Counter

counter = Counter(iterate_words(soup))
counter

Counter({'The': 2,
         "Dormouse's": 2,
         'story': 2,
         'Once': 1,
         'upon': 1,
         'a': 2,
         'time': 1,
         'there': 1,
         'were': 2,
         'three': 1,
         'little': 1,
         'sisters': 1,
         'and': 3,
         'their': 1,
         'names': 1,
         'Elsie': 1,
         'Lacie': 1,
         'Tillie': 1,
         'they': 1,
         'lived': 1,
         'at': 1,
         'the': 1,
         'bottom': 1,
         'of': 1,
         'well': 1})

In [84]:
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
filename = download(url)

In [85]:
fp = open(filename)
soup2 = BeautifulSoup(fp)

In [61]:
counter2 = Counter(iterate_words(soup2))
counter2.most_common(10)

[('Python', 426),
 ('the', 270),
 ('and', 265),
 ('a', 218),
 ('of', 196),
 ('to', 173),
 ('Retrieved', 167),
 ('in', 149),
 ('is', 134),
 ('for', 118)]

This list suggests TF/IDF.

In [64]:
!redis-server --daemonize yes

128077:C 10 Aug 2021 14:07:17.642 # oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
128077:C 10 Aug 2021 14:07:17.642 # Redis version=5.0.3, bits=64, commit=00000000, modified=0, pid=128077, just started
128077:C 10 Aug 2021 14:07:17.642 # Configuration loaded


In [65]:
import redis

r = redis.Redis()

In [79]:
key = f'Counter(test)'
key

'Counter(test)'

In [80]:
for word in iterate_words(soup):
    r.hincrby(key, word, 1)

In [82]:
r.hget(key, 'The')

b'4'

In [73]:
words = r.hkeys(key)
words

[b'The',
 b"Dormouse's",
 b'story',
 b'Once',
 b'upon',
 b'a',
 b'time',
 b'there',
 b'were',
 b'three',
 b'little',
 b'sisters',
 b'and',
 b'their',
 b'names',
 b'Elsie',
 b'Lacie',
 b'Tillie',
 b'they',
 b'lived',
 b'at',
 b'the',
 b'bottom',
 b'of',
 b'well']

In [75]:
for item in r.hscan_iter(key):
    print(item)

(b'The', b'2')
(b"Dormouse's", b'2')
(b'story', b'2')
(b'Once', b'1')
(b'upon', b'1')
(b'a', b'2')
(b'time', b'1')
(b'there', b'1')
(b'were', b'2')
(b'three', b'1')
(b'little', b'1')
(b'sisters', b'1')
(b'and', b'3')
(b'their', b'1')
(b'names', b'1')
(b'Elsie', b'1')
(b'Lacie', b'1')
(b'Tillie', b'1')
(b'they', b'1')
(b'lived', b'1')
(b'at', b'1')
(b'the', b'1')
(b'bottom', b'1')
(b'of', b'1')
(b'well', b'1')


In [88]:
def redis_word_count(url):
    filename = download(url)
    soup = BeautifulSoup(open(filename))

    key = f'Counter({url})'
    r.delete(key)
    
    for word in iterate_words(soup):
        r.hincrby(key, word, 1)
        
    return key

In [89]:
key = redis_word_count(url)

In [90]:
r.hget(key, 'Python')

b'426'

TODO: Count the words locally and upload the counter to Redis

# Indexer

At this point we have built a basic Web crawler; the next piece we will
work on is the **index**. In the context of web search, an index is a
data structure that makes it possible to look up a search term and find
the pages where that term appears. In addition, we would like to know
how many times the search term appears on each page, which will help
identify the pages most relevant to the term.

For example, if a user submits the search terms "Java" and
"programming", we would look up both search terms and get two sets of
pages. Pages with the word "Java" would include pages about the island
of Java, the nickname for coffee, and the programming language. Pages
with the word "programming" would include pages about different
programming languages, as well as other uses of the word. By selecting
pages with both terms, we hope to eliminate irrelevant pages and find
the ones about Java programming.

Now that we understand what the index is and what operations it
performs, we can design a data structure to represent it.

In [94]:
for word, count in counter.items():
    print(word, count)

The 2
Dormouse's 2
story 2
Once 1
upon 1
a 2
time 1
there 1
were 2
three 1
little 1
sisters 1
and 3
their 1
names 1
Elsie 1
Lacie 1
Tillie 1
they 1
lived 1
at 1
the 1
bottom 1
of 1
well 1


In [95]:
def redis_index(url):
    filename = download(url)
    soup = BeautifulSoup(open(filename))
    counter = Counter(iterate_words(soup))
    for word, count in counter.items():
        key = f'Index({word})'
        r.hset(key, url, count)

In [98]:
key = f'Index(Python)'
r.hget(key, url)

b'426'

In [100]:
for page, count in r.hscan_iter(key):
    print(page, count)

b'https://en.wikipedia.org/wiki/Python_(programming_language)' b'426'


In [None]:
!killall redis-server