# 1. Crawler


The ```WGET``` file is attached in moodle as well as in my [repo](https://github.com/Ehsan2754/information-retrieval/blob/master/templates/2022S/2022S-01/wget.py).  The file content is as following: 

In [12]:
import argparse
import os
import re
import requests


def wget(url, filename):
    # allow redirects - in case file is relocated
    resp = requests.get(url, allow_redirects=True)
    # this can also be 2xx, but for simplicity now we stick to 200
    # you can also check for `resp.ok`
    if resp.status_code != 200:
        print(resp.status_code, resp.reason, 'for', url)
        return
    
    # just to be cool and print something
    print(*[f"{key}: {value}" for key, value in resp.headers.items()], sep='\n')
    print()
    
    # try to extract filename from url
    if filename is None:
        # start with http*, ends if ? or # appears (or none of)
        m = re.search("^http.*/([^/\?#]*)[\?#]?", url)
        filename = m.group(1)
        if not filename:
            raise NameError(f"Filename neither given, nor found for {url}")

    # what will you do in case 2 websites store file with the same name?
    if os.path.exists(filename):
        raise OSError(f"File {filename} already exists")
    
    with open(filename, 'wb') as f:
        f.write(resp.content)
        print(f"File saved as {filename}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='download file.')
    parser.add_argument("-O", type=str, default=None, dest='filename', help="output file name. Default -- taken from resource")
    parser.add_argument("url", type=str, default=None, help="Provide URL here")
    args = parser.parse_args()
    wget(args.url, args.filename)

usage: ipykernel_launcher.py [-h] [-O FILENAME] url
ipykernel_launcher.py: error: the following arguments are required: url


SystemExit: 2

## 1.1. [15] Download and persist #
Please complete a code for `load()`, `download()` and `persist()` methods of `Document` class. What they do:
- for a given URL `download()` method downloads binary data and stores in `self.content`. It returns `True` for success, else `False`.
- `persist()` method saves `self.content` somewhere in file system. We do it to avoid multiple downloads (for caching in other words).
- `load()` method loads data from hard drive. Returns `True` for success.

Tests checks that your code somehow works.

**NB Passing the test doesn't mean you correctly completed the task.** These are **criteria, which have to be fullfilled**:
1. URL is a unique identifier (as it is a subset of URI). Thus, documents with different URLs should be stored in different files. Typical errors: documents from the same domain are overwritten to the same file, URLs with similar endings are downloaded to the same file, etc.
2. The document can be not only a text file, but also a binary. Pay attention that if you download `mp3` file, it still can be played. Hint: don't hurry to convert everything to text.

In [2]:
import requests
from urllib.parse import quote
import hashlib
import re
import argparse
import os
import re
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class Document:
    BAD_REQUEST = -1
    FILE_EXISTS = 0
    SUCCESS = 1
    def __alt_wget(self,url, filename):
        # allow redirects - in case file is relocated
        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        resp = requests.get(self.url,allow_redirects=True)
        #  requests.get(url, allow_redirects=True)
        # this can also be 2xx, but for simplicity now we stick to 200
        # you can also check for `resp.ok`
        if resp.status_code != 200:
            print(resp.status_code, resp.reason, 'for', url)
            return (self.BAD_REQUEST,None)
        
        # just to be cool and print something
        print(*[f"{key}: {value}" for key, value in resp.headers.items()], sep='\n')
        print()
        return (self.SUCCESS,resp.content)


    
    def __init__(self, url):
        self.url = url
        # m = re.search("^http.*/([^/\?#]*)[\?#]?", url)
        # self.filename = m.group(1)
        # if not self.filename:
        #     raise NameError(f"Filename neither given, nor found for {url}")
        self.filename = hashlib.sha256(url.encode()).hexdigest()
        
    def get(self):
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()
    
    def download(self):
        #TODO download self.url content, store it in self.content and return True in case of success
        result,self.content = self.__alt_wget(self.url,self.filename)
        return (result==self.SUCCESS)
    
    def persist(self):
        #TODO write document content to hard drive

        # what will you do in case websites store file with the same name?
        if os.path.exists(self.filename):
             raise OSError(f"File {self.filename} already exists")
        with open(self.filename, 'wb') as f:
            f.write(self.content)
            print(f"File saved as {self.filename}")
            
    def load(self):
        #TODO load content from hard drive, store it in self.content and return True in case of success
        if not os.path.exists(self.filename):
            return False
        with open(self.filename, 'rb') as f:
            self.content = f.read()
            print(f"File loaded: {self.filename}")
        return True

### 1.1.1. Tests ###

In [3]:
doc = Document('http://sprotasov.ru/data/iu.txt')

doc.get()
assert doc.content, "Document download failed"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"

File loaded: e3f92252000cd79aa00e286a050c7825ff95897ad1c4d62c4e43bf948085d14c
File loaded: e3f92252000cd79aa00e286a050c7825ff95897ad1c4d62c4e43bf948085d14c
File loaded: e3f92252000cd79aa00e286a050c7825ff95897ad1c4d62c4e43bf948085d14c


## 1.3. [10] Parse HTML ##
`BeautifulSoap` library is a de facto standard to parse XML and HTML documents in python. Use it to complete `parse()` method that extracts document contents. You should initialize:
- `self.anchors` list of tuples `('text', 'url')` met in a document. Be aware, there exist relative links (e.g. `../content/pic.jpg`). Use `urllib.parse.urljoin()` to fix this issue.
- `self.images` list of images met in a document. Again, links can be relative to current page.
- `self.text` should keep plain text of the document without scripts, tags, comments and so on. You can refer to [this stackoverflow answer](https://stackoverflow.com/a/1983219) for details.

**NB All these 3 criteria must be fulfilled to get full point for the task.**

In [4]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse


class HtmlDocument(Document):

    def normalize_link(self, link):
        if link is not None and link[:4] == 'http':
            return link
        return urllib.parse.urljoin(self.url, link)
    
    def parse(self):
        #TODO extract plain text, images and links from the document
        self.get()
        soup = BeautifulSoup(self.content)
        self.anchors = []
        self.images = []
        self.text = ""

        a_links = soup.find_all("a") 
        for a in a_links:
            text = a.getText()
            link = a.get('href')
            self.anchors.append((text, self.normalize_link(link)))

        imgs = soup.find_all("img")
        for img in imgs:
            link = img.get('src')
            self.images.append(self.normalize_link(link))
        
        #TODO get the text using soup.getAll(text=true) and filter unwanted tags as ['style', 'script', 'head', 'title'] e.g. anything not visible
        text = soup.find("body").getText()
        self.text = text.replace('\n','').replace('\r','').strip() if type(text)==str else ""

### 1.3.1. Tests ###

In [5]:
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

assert "just few links" in doc.text, "Error parsing text"
assert "http://sprotasov.ru/images/gb.svg" in doc.images, "Error parsing images"
assert any(p[1] == "https://twitter.com/07C3" for p in doc.anchors), "Error parsing links"
print(doc.text)


File loaded: 2f3ba9f50c5510bce6a195f516b85dcf0aae989144395037ec0774dec5872b86
File loaded: 2f3ba9f50c5510bce6a195f516b85dcf0aae989144395037ec0774dec5872b86
Stanislav Protasov in just few links:Curriculum vitaeGoogle ScholarGitHubResearchGateПубликации в eLibraryFacebookLinkedInResearch with Stas telegram channelПодкаст "Происхождение видов": telegram, iTunes, RSSAutomatic testing system (source code)Книга "Давайте объясню: или зачем программисту математика"Материалы на ПостНаукеTwitter


## 1.4. [10] Document analysis ##
Complete the code for `HtmlDocumentTextData` class. Implement word and sentence splitting (use any method you can propose). 

**Criteria of success**: 
1. Your `get_word_stats()` method should return `Counter` object.
2. Don't forget to lowercase your words for counting.
3. Sentences should be obtained inside `<body>` tag only.

In [6]:
from collections import Counter

class HtmlDocumentTextData:
    
    def __init__(self, url):
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()
    
    def get_sentences(self):
        #TODO implement sentence parser
        result = self.doc.text.split('.')
        return result
    
    def get_word_stats(self):
        #TODO return Counter object of the document, containing mapping {`word` -> count_in_doc}
        words =  [item.lower() for sublist in list(map(str.split,self.get_sentences())) for item in sublist] 
        # extracting words from sentences and flattening the list and lowercasing the words.
    
        return Counter(words)

### 1.4.1. Tests ###

In [7]:
doc = HtmlDocumentTextData("https://innopolis.university/")
print(doc.get_word_stats().most_common(10))
assert [x for x in doc.get_word_stats().most_common(10) if x[0] == 'иннополис'], 'иннополис should be among most common'

File loaded: a098b299ac4dd3ffe293ff4f93725462aef38ebd339d74e9bfafdea53a3abfd5
File loaded: a098b299ac4dd3ffe293ff4f93725462aef38ebd339d74e9bfafdea53a3abfd5
[('и', 59), ('в', 30), ('иннополис', 18), ('по', 16), ('2022', 15), ('на', 14), ('университет', 12), ('области', 10), ('с', 10), ('университета', 9)]


## 1.5. [15] Crawling ##

Method `crawl_generator()` is given starting url (`source`) and max depth of search. It should return a **generator** of `HtmlDocumentTextData` objects (return a document as soon as it is downloaded and parsed). You can benefit from `yield obj_name` python construction. Use `HtmlDocumentTextData.anchors` field to go deeper.

In [10]:
from queue import Queue


class Crawler:
    # def bfs(self):
    def _fetch(self,links):
        res = []
        for link in links:
            try:
                res.append(HtmlDocumentTextData(link))
            except:
                continue 
        return res

    def crawl_generator(self, source, depth=1):
        # TODO return real crawling results. Don't forget to process failures
        cur = self._fetch([source])
        result = set()
        if depth < 1:
            raise ValueError("Invalid Depth")
        for i in range(depth,0,-1):
            tmp = []
            for item in cur:
                result.add(item)
                tmp = [c[1] for c in item.doc.anchors]

            if not depth == 1 : cur = self._fetch(tmp)
        return list(result)



### 1.5. Tests ###

In [9]:
crawler = Crawler()
counter = Counter()

for c in crawler.crawl_generator("https://innopolis.university/en/",2):
    print(c.doc.url)
    if c.doc.url[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
        print("Skipping", c.doc.url)
        continue
    counter.update(c.get_word_stats())
    print(len(counter), "distinct word(s) so far")
    
print("Done")

print(counter.most_common(20))
assert [x for x in counter.most_common(20) if x[0] == 'innopolis'], 'innopolis sould be among most common'

File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab112895a069ad96e
File loaded: b1c10b8b82cfc2a140c0eb3e9dbfbfe00e1ff35f4a266c5ab11

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


File loaded: a68150a59ac36ca758d9d13ffb560ade83f2aea9707d5703c07b9b736dd5181a
File loaded: a68150a59ac36ca758d9d13ffb560ade83f2aea9707d5703c07b9b736dd5181a
File loaded: 9e43dcc3e3ef2d03704e12cbd315f9cb5b1d1770b9b56a974ee83f9c044f0163
File loaded: 9e43dcc3e3ef2d03704e12cbd315f9cb5b1d1770b9b56a974ee83f9c044f0163
File loaded: 7b241203abaac188ad973813b3935582093e269e1c793730b464d57d2fce26e1
File loaded: 7b241203abaac188ad973813b3935582093e269e1c793730b464d57d2fce26e1
File loaded: 5f45553c4897b1cdb53ec435cb5b2ef25675fce8632fe81bbf554ab05d2a9d5c
File loaded: 5f45553c4897b1cdb53ec435cb5b2ef25675fce8632fe81bbf554ab05d2a9d5c
File loaded: 22f2aae924a6d457284dfff57c3a2701b2f0f205b7ea6aadb5311cb8b905073a
File loaded: 22f2aae924a6d457284dfff57c3a2701b2f0f205b7ea6aadb5311cb8b905073a
File loaded: 151dd226e3aba697c639f3581ff949247f32583be67d37e4703542faf41d0c95
File loaded: 151dd226e3aba697c639f3581ff949247f32583be67d37e4703542faf41d0c95
File loaded: 0958bc95498c022f38de6e7d2b6778953c19a7ef4a09eba156d