# NB: This document contains master-level tasks

## 1. [M][15] Account for the caching policy

Sometimes remote documents (especially when we speak about static content like `js` or `gif`) can swear that they will not change for some time. This is done by setting [Cache-Control response header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control).

In [1]:
import requests
# Send a HEAD request (no body is requested) and keep the result in `response`
# so that its headers, e.g. Cache-Control, can be inspected.
response = requests.head('https://polyfill.io/v3/polyfill.min.js')

In [2]:
import datetime
def process_cache(cache, fetch_date):
    """
      Function for checking cache expiry
      args:
        cache: The cache-control string returned by the response (None when the header is absent)
        fetch_date: The Date header of the response, e.g. 'Tue, 15 Nov 1994 08:12:31 GMT'
      response:
        returns False if
          1. no cache-control header
          2. the cache-control header contains either no-store or no-cache
          3. no usable s-maxage / max-age lifetime is present
          4. the fetch date cannot be parsed
          5. the lifetime has elapsed
        otherwise returns True
    """
    if cache is None:
        print('No Cache Header')
        return False

    # Directive names are case-insensitive, so normalise them once.
    directives = [part.strip().lower() for part in cache.split(',')]
    if 'no-store' in directives or 'no-cache' in directives:
        print('Cache has no-store or no-cache field')
        return False

    def _lifetime(name):
        """Return the integer value of directive `name`, or None if absent/malformed."""
        prefix = name + '='
        for directive in directives:
            if directive.startswith(prefix):
                try:
                    return int(directive[len(prefix):].strip('"'))
                except ValueError:
                    return None
        return None

    lifetime = None
    if 'public' in directives:
        print('Cache-header has public')
        # For publicly cacheable responses s-maxage takes precedence over max-age.
        lifetime = _lifetime('s-maxage')
    elif 'private' in directives:
        print('Cache-header has private')
    if lifetime is None:
        # max-age is valid on its own, with or without public/private.
        lifetime = _lifetime('max-age')

    if lifetime is not None:
        try:
            # The Date header is expressed in GMT, so the comparison must be
            # done in UTC rather than against the machine's local time.
            fetched_at = datetime.datetime.strptime(
                str(fetch_date).strip(), '%a, %d %b %Y %H:%M:%S GMT'
            ).replace(tzinfo=datetime.timezone.utc)
            expires_at = fetched_at + datetime.timedelta(seconds=lifetime)
        except (ValueError, OverflowError):
            # Freshness cannot be established; fall through to "expired".
            expires_at = None
        if expires_at is not None and datetime.datetime.now(datetime.timezone.utc) < expires_at:
            print("Cache hasn't expired ")
            return True

    print("Cache has expired")
    return False

In [3]:
import argparse
import os
import re
import requests


def wget(url, timeout=30):
    """
      Function for requesting urls
      args:
        url: The URL to be requested
        timeout: Seconds to wait for the server before giving up
      response:
        The content of the response as a bytearray, or None when the
        status code is not 200
    """
    # allow redirects - in case file is relocated.
    # The context manager releases the streamed connection on every exit path,
    # including the early return for non-200 answers.
    with requests.get(url, allow_redirects=True, stream=True, timeout=timeout) as resp:
        # this can also be 2xx, but for simplicity now we stick to 200
        if resp.status_code != 200:
            print(resp.status_code, resp.reason, 'for', url)
            return None

        # just to be cool and print something
        print(*[f"{key}: {value}" for key, value in resp.headers.items()], sep='\n')
        print()

        # Read the body in chunks in order to manage system resources.
        body = bytearray()
        for chunk in resp.iter_content(chunk_size=8192):
            body += chunk
        return body

In [4]:
def wget_cache(url, timeout=30):
    """
      Function for requesting urls
      args:
        url: The URL to be requested
        timeout: Seconds to wait for the server before giving up
      response:
        A tuple containing the content of the response (bytearray), the
        cache-control header (None when absent) and the date header as an
        HTTP-date string
      raises:
        requests.HTTPError when the server answers with an error status
    """
    # The context manager releases the streamed connection even when
    # raise_for_status() aborts the download.
    with requests.get(url, allow_redirects=True, stream=True, timeout=timeout) as resp:
        print("Fetching file from online")
        resp.raise_for_status()

        body = bytearray()
        for chunk in resp.iter_content(chunk_size=8192):
            body += chunk

        fetch_date = resp.headers.get('Date')
        if fetch_date is None:
            # Fall back to "now" in the same HTTP-date format the Date header
            # uses, so the value stays parseable after it has been persisted.
            fetch_date = datetime.datetime.now(datetime.timezone.utc).strftime(
                '%a, %d %b %Y %H:%M:%S GMT'
            )
        return (body, resp.headers.get('Cache-Control'), fetch_date)

In [5]:
import requests
from urllib.parse import quote
import hashlib

class Document:
    """
      A remote document that is stored on the hard drive after the first download.
      The file name is the SHA-256 hex digest of the URL, relative to the
      current working directory.
    """

    def __init__(self, url):
        self.url = url

    def _filename(self):
        """Return the name of the file that holds the content of this URL."""
        return hashlib.sha256(self.url.encode()).hexdigest()

    def get(self):
        """
          Make self.content available, preferring the stored copy
          raises:
            FileNotFoundError when the document is neither stored nor downloadable
        """
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()

    def download(self):
        """
          Download self.url and store the body in self.content
          response:
            True in case of success, False otherwise
        """
        try:
            result = wget(self.url)
        except Exception as e:
            # Report the reason instead of swallowing it silently; Ctrl-C and
            # interpreter exit are no longer intercepted.
            print(e)
            return False
        if result is None:
            return False
        self.content = result
        return True

    def persist(self):
        """Write self.content to the hard drive."""
        with open(self._filename(), "wb") as file:
            file.write(self.content)

    def load(self):
        """
          Load the content from the hard drive into self.content
          response:
            True in case of success, False when the file cannot be read
        """
        try:
            with open(self._filename(), "rb") as file:
                self.content = file.read()
        except OSError:
            return False
        return True

Please study the documentation and implement a descendant of the `Document` class, which will refresh the document when its cache has expired, even if the file is already on the hard drive.

In [6]:
class CachedDocument(Document):
    """
      Overloads the document class by storing another file
      with .cache extension to store necessary cache information
      (first line: the cache-control header followed by a comma,
      second line: the fetch date)
    """

    def _cache_filename(self):
        """Return the name of the side-car file holding the cache information."""
        return hashlib.sha256(self.url.encode()).hexdigest() + '.cache'

    def download(self):
        """
          Download self.url and remember its content, cache-control header and fetch date
          response:
            True in case of success, False otherwise
        """
        try:
            result, cache_data, date = wget_cache(self.url)
        except Exception as e:
            print(e)
            return False
        if result is None:
            return False
        self.content = result
        self.cache_data = cache_data
        self.fetch_date = date
        return True

    def persist(self):
        """Write the content and, next to it, the cache information to the hard drive."""
        Document.persist(self)
        with open(self._cache_filename(), "w") as file:
            file.write(f'{self.cache_data},\n{self.fetch_date}')

    def load(self):
        """
          Load the content from the hard drive, but only while the cache is still fresh
          response:
            True when a fresh copy was loaded into self.content, False otherwise
        """
        try:
            with open(self._cache_filename(), "r") as file:
                cache_line = file.readline()
                date_line = file.readline()
        except OSError:
            return False

        # persist() appends a ',' to the header line and stores a missing
        # header as the literal text 'None'; undo both before evaluating.
        cache_data = cache_line.strip()
        if cache_data.endswith(','):
            cache_data = cache_data[:-1]
        self.cache_data = None if cache_data == 'None' else cache_data
        self.fetch_date = date_line.strip()

        try:
            fresh = process_cache(self.cache_data, self.fetch_date)
        except (ValueError, IndexError):
            # Malformed cache information cannot prove freshness.
            fresh = False
        if not fresh:
            return False
        return Document.load(self)

### Tests

Add logging to your code and show that your code behaves differently for documents with different caching policy.

In [7]:
import time

# Fetch every page three times, two seconds apart, to show how the caching
# policy of each page influences the behaviour.
for page_url in ('https://polyfill.io/v3/polyfill.min.js', 'https://yandex.ru/'):
    doc = CachedDocument(page_url)
    doc.get()
    for _ in range(2):
        time.sleep(2)
        doc.get()

Cache-header has public
Cache hasn't expired 
Cache-header has public
Cache hasn't expired 
Cache-header has public
Cache hasn't expired 
Cache has expired
Fetching file from online
Cache has expired
Fetching file from online
Cache has expired
Fetching file from online


## 2. [M][35] Languages
You may have heard that there are many languages in the world. European languages, like Russian and English, use similar punctuation, but even in this family there is ¡Spanish!

Other languages can use different punctuation rules, like **Arabic or [Thai](http://www.thai-language.com/ref/breaking-words)**.

Your task is to support (at least) three languages (English, Arabic, and Thai) tokenization in your `HtmlDocumentTextData` class descendant.

What should you do (acceptance criteria):
1. Use any language detection technique, e.g. [langdetect](https://pypi.org/project/langdetect/).
2. Use language-specific tokenization tools, e.g. for [Thai](https://pythainlp.github.io/tutorials/notebooks/pythainlp_get_started.html#Tokenization-and-Segmentation) and [Arabic](https://github.com/CAMeL-Lab/camel_tools).
3. Use these pages to test your code: [1](https://www.bangkokair.com/tha/baggage-allowance) and [2](https://alfajr-news.net/details/%D9%85%D8%B4%D8%B1%D9%88%D8%B9-%D8%AF%D9%8A%D9%85%D9%88%D9%82%D8%B1%D8%A7%D8%B7%D9%8A-%D9%81%D9%8A-%D8%A7%D9%84%D9%83%D9%88%D9%86%D8%BA%D8%B1%D8%B3-%D8%A7%D9%84%D8%A3%D9%85%D8%B1%D9%8A%D9%83%D9%8A-%D9%84%D9%85%D8%B9%D8%A7%D9%82%D8%A8%D8%A9-%D8%A8%D9%88%D8%AA%D9%8A%D9%86).
4. Pass the tests.

In [8]:
import nltk
# Fetch the NLTK 'punkt' data package; presumably it is what the nltk
# tokenizers used further down rely on - verify.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse
import re

class HtmlDocument(CachedDocument):
    """
      Cached document that extracts plain text, images and links from its HTML content.
      After parse() the results are available as self.anchors, self.images and self.text.
    """

    # Links that already carry a scheme ("http:", "mailto:", ...) or are
    # protocol-relative ("//host/path") are kept untouched.
    _ABSOLUTE_LINK = re.compile(r"(?:^[a-z][a-z0-9+.-]*:|//)")

    # Text nodes whose parent is one of these tags are never rendered.
    _INVISIBLE_PARENTS = ('style', 'script', 'head', 'title', 'meta', '[document]')

    def parse(self):
        """
          Parse self.content and fill
            self.anchors: list of unique (anchor text, absolute url) tuples
            self.images: list of unique absolute image urls
            self.text: a string of all the visible texts
          On failure the error is printed and the attributes keep their empty defaults.
        """
        def _preprocess_link(link):
            """
              Function for handling relative urls
              args:
                link: The url to be processed
              response:
                The url, resolved against self.url when it is relative
            """
            if self._ABSOLUTE_LINK.match(link):
                return link
            return urllib.parse.urljoin(self.url, link)

        def _get_anchors(dom):
            """
              Function for getting the links in the dom
              args:
                dom: The dom
              response:
                A list of unique (anchor text, url) tuples
            """
            return list({(a.text, _preprocess_link(a['href']))
                         for a in dom.find_all('a', href=True)})

        def _get_images(dom):
            """
              Function for getting the image sources in the dom
              args:
                dom: The dom
              response:
                A list of all the unique image sources
            """
            return list({_preprocess_link(img['src'])
                         for img in dom.find_all('img', src=True)})

        def tag_visible(element):
            """
              Function for checking if a text node is among the visible elements in the dom
              args:
                element: The text node to be checked
              response: A boolean specifying if the node is visible
            """
            if element.parent.name in self._INVISIBLE_PARENTS:
                return False
            if isinstance(element, Comment):
                return False
            return True

        def _get_text(dom):
            """
              Function for getting visible texts in the dom
              args:
                dom: The dom
              response: A string of all the visible texts
            """
            visible_texts = filter(tag_visible, dom.find_all(string=True))
            return u" ".join(t.strip() for t in visible_texts)

        # Defaults first, so that the attributes exist even when parsing fails.
        self.anchors = []
        self.images = []
        self.text = ""
        try:
            raw = bytes(self.content)
            try:
                markup = raw.decode()
            except UnicodeDecodeError:
                # Not UTF-8: hand the raw bytes over and let BeautifulSoup
                # detect the encoding instead of giving up on the page.
                markup = raw
            dom = BeautifulSoup(markup, "html.parser")
            self.anchors = _get_anchors(dom)
            self.images = _get_images(dom)
            self.text = _get_text(dom)
        except Exception as e:
            print(e)

In [10]:
from collections import Counter
from nltk import tokenize

class HtmlDocumentTextData:
    """
      Text statistics of the HTML document behind a url.
      The document is fetched and parsed on construction; when that fails the
      object behaves like an empty document.
    """

    def __init__(self, url):
        self.doc = HtmlDocument(url)
        try:
            self.doc.get()
            self.doc.parse()
        except FileNotFoundError:
            print("File Not Found")

    def _text(self):
        """Return the stripped visible text, or '' when the document has none."""
        # self.doc.text is missing when the download or the parsing failed.
        return (getattr(self.doc, 'text', '') or '').strip()

    def get_sentences(self):
        """
          response:
            The list of sentences of the document
        """
        return nltk.sent_tokenize(self._text())

    def get_word_stats(self):
        """
          response:
            Counter object of the document, containing mapping {`word` -> count_in_doc}
            with all words lower-cased
        """
        return Counter(map(str.lower, nltk.word_tokenize(self._text())))

In [11]:
pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
pip install pythainlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
pip install camel-tools


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
from langdetect import detect
from pythainlp import word_tokenize
from camel_tools.tokenizers.word import simple_word_tokenize

class MultilingualHtmlDocumentTextData(HtmlDocumentTextData):
    """
      Text statistics that pick the word tokenizer by the detected language:
      pythainlp for Thai, camel_tools for Arabic and nltk for everything else.
    """

    def get_word_stats(self):
        """
          response:
            Counter object of the document, containing mapping {`word` -> count_in_doc}
            with all words lower-cased; empty when the document has no text
        """
        from langdetect.lang_detect_exception import LangDetectException

        # self.doc.text is missing when the download or the parsing failed.
        text = (getattr(self.doc, 'text', '') or '').strip()
        if not text:
            return Counter()

        try:
            lang = detect(text)
        except LangDetectException:
            # Raised for text without usable features (e.g. only digits or
            # punctuation); fall back to the default tokenizer.
            lang = None

        if lang == 'th':
            result = word_tokenize(text, keep_whitespace=False)
        elif lang == 'ar':
            result = simple_word_tokenize(text)
        else:
            result = nltk.word_tokenize(text)
        return Counter(map(str.lower, result))
    #TODO your code here

### Tests

In [15]:
# Print the ten most frequent tokens of the Thai and the Arabic test page.
for page_url in (
    "https://www.bangkokair.com/tha/baggage-allowance",
    "https://alfajr-news.net/details/%D9%85%D8%B4%D8%B1%D9%88%D8%B9-%D8%AF%D9%8A%D9%85%D9%88%D9%82%D8%B1%D8%A7%D8%B7%D9%8A-%D9%81%D9%8A-%D8%A7%D9%84%D9%83%D9%88%D9%86%D8%BA%D8%B1%D8%B3-%D8%A7%D9%84%D8%A3%D9%85%D8%B1%D9%8A%D9%83%D9%8A-%D9%84%D9%85%D8%B9%D8%A7%D9%82",
):
    doc = MultilingualHtmlDocumentTextData(page_url)
    print(doc.get_word_stats().most_common(10))

Cache has no-store or no-cache field
Fetching file from online
[('สัมภาระ', 34), ('เรา', 24), ('การ', 23), ('ที่', 21), ('กิโลกรัม', 21), ('ของ', 20), ('และ', 20), ('เดินทาง', 17), ('เที่ยวบิน', 16), ('บริการ', 16)]
Cache has expired
Fetching file from online
[('.', 7), (':', 6), ('-', 5), ('أخبار', 5), ('الفجر', 5), ('فن', 4), ('الأكثر', 4), ('قراءة', 4), ('أخترنا', 4), ('لك', 4)]
