Naloga pajka:
1. HTTP downloader and renderer: To retrieve and render a web page.
2. Data extractor: Minimal functionalities to extract images and hyperlinks.
3. Duplicate detector: To detect already parsed pages.
4. URL frontier: A list of URLs waiting to be parsed.
5. Datastore: To store the data and additional metadata used by the crawler.

TO-DO: 2 - image extraction, duplicate detector

In [None]:
import concurrent.futures
import threading
import psycopg2

# Module-wide lock: every *_locking database helper below runs its body under it.
lock = threading.Lock()

def reset_db(conn):
    """Delete every row from the crawler tables.

    The tables are emptied in a fixed order: image, page_data, link, page
    and finally site (presumably children before parents; the schema is
    not visible in this file).

    Args:
        conn: open database connection. Its ``autocommit`` flag is set to
            True as a side effect.
    """
    statements = (
        "DELETE FROM crawldb.image",
        "DELETE FROM crawldb.page_data",
        "DELETE FROM crawldb.link",
        "DELETE FROM crawldb.page",
        "DELETE FROM crawldb.site",
    )
    conn.autocommit = True
    cursor = conn.cursor()
    for statement in statements:
        cursor.execute(statement)
    conn.commit()
    cursor.close()
  
def update_site_locking(domain, sitemap, robotstxt, delay, conn):
    """Insert one row into crawldb.site while holding the module lock.

    The values are bound positionally: ``sitemap`` fills the
    ``robots_content`` column and ``robotstxt`` fills ``sitemap_content``.
    The caller in this file (``pajek``) passes the robots text second and
    the sitemap list third, so the columns end up with matching data even
    though the parameter names suggest otherwise.

    Args:
        domain: value for the ``domain`` column.
        sitemap: value stored in ``robots_content`` (see above).
        robotstxt: value stored in ``sitemap_content`` (see above).
        delay: value for the ``delay`` column.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        The id of the inserted row, ``None`` (after printing a message)
        when the insert reported no row, or the string ``'err'`` when any
        exception was raised.
    """
    insert_sql = "INSERT INTO crawldb.site (domain, robots_content,sitemap_content,delay) VALUES (%s, %s, %s,%s) RETURNING id;"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(insert_sql, (domain, sitemap, robotstxt, delay))
            new_id = cursor.fetchone()[0] if cursor.rowcount != 0 else -1
            conn.commit()
            cursor.close()
            if new_id == -1:
                print("Error with cur in update_site_locking!")
                return None
            return new_id
        except Exception as error:
            print("Error in update_site_locking: ", error)
            return 'err'

                    
    
                    
def update_page_locking(siteId, url, html_content, status_code, acc_time, page_type_code, conn):
    """Insert one row into crawldb.page while holding the module lock.

    Args:
        siteId: value for ``site_id``.
        url: value for ``url``.
        html_content: value for ``html_content``.
        status_code: value for ``http_status_code``.
        acc_time: value for ``accessed_time``.
        page_type_code: value for ``page_type_code``.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        The id of the inserted row, ``None`` (after printing a message)
        when the insert reported no row, or the string ``'err'`` when any
        exception was raised.
    """
    insert_sql = "INSERT INTO crawldb.page (site_id, url,html_content,http_status_code,accessed_time, page_type_code) VALUES (%s,%s,%s,%s,%s,%s) RETURNING id;"
    values = (siteId, url, html_content, status_code, acc_time, page_type_code)
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(insert_sql, values)
            new_id = cursor.fetchone()[0] if cursor.rowcount != 0 else -1
            conn.commit()
            cursor.close()
            if new_id == -1:
                print("Error with cur in update_page_locking!")
                return None
            return new_id
        except Exception as error:
            print("Error in update_page_locking: ", error)
            return 'err'
    
                    
def update_image_locking(image_data, pageId, conn):
    """Insert one crawldb.image row per entry of ``image_data``.

    Runs under the module lock and commits after every single insert.

    Args:
        image_data: iterable of dicts with the keys ``'filename'``,
            ``'content_type'``, ``'data'`` and ``'accessed_time'``.
        pageId: value for the ``page_id`` column of every row.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        ``None`` on success, or the string ``'err'`` when any exception
        was raised (rows inserted before the failure stay committed).
    """
    insert_sql = "INSERT INTO crawldb.image (page_id,filename, content_type,data,accessed_time) VALUES (%s,%s,%s, %s, %s);"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            for image in image_data:
                row = (
                    pageId,
                    image['filename'],
                    image['content_type'],
                    image['data'],
                    image['accessed_time'],
                )
                cursor.execute(insert_sql, row)
                conn.commit()
            cursor.close()
        except Exception as error:
            print("Error in update_image_locking: ", error)
            return 'err'
    return None
                    
    
                    
def update_page_data_locking(pageId, data_type_code, data, conn):
    """Insert one row into crawldb.page_data while holding the module lock.

    Args:
        pageId: value for ``page_id``.
        data_type_code: value for ``data_type_code``.
        data: value for ``data``.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        The id of the inserted row, ``None`` (after printing a message)
        when the insert reported no row, or the string ``'err'`` when any
        exception was raised.
    """
    insert_sql = "INSERT INTO crawldb.page_data (page_id,data_type_code, data) VALUES (%s, %s, %s) RETURNING id;"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(insert_sql, (pageId, data_type_code, data))
            new_id = cursor.fetchone()[0] if cursor.rowcount != 0 else -1
            conn.commit()
            cursor.close()
            if new_id == -1:
                print("Error with cur in update_page_data_locking!")
                return None
            return new_id
        except Exception as error:
            print("Error in update_page_data_locking: ", error)
            return 'err'
                    
    
                    
                                        
def update_link_locking(from_page, to_page, conn):
    """Insert one (from_page, to_page) row into crawldb.link under the module lock.

    Args:
        from_page: value for the ``from_page`` column.
        to_page: value for the ``to_page`` column.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        ``None`` on success, or the string ``'err'`` when any exception
        was raised.
    """
    insert_sql = "INSERT INTO crawldb.link (from_page,to_page) VALUES (%s, %s);"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(insert_sql, (from_page, to_page))
            conn.commit()
            cursor.close()
        except Exception as error:
            print("Error in update_link_locking: ", error)
            return 'err'
    return None
    
def get_domain_id_locking(domain, conn):
    """Look up the id of the crawldb.site row whose ``domain`` matches.

    Args:
        domain: domain string to search for.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        The id of the first matching row, ``None`` when no row matched,
        or the string ``'err'`` when any exception was raised.
    """
    query = "SELECT id FROM crawldb.site WHERE domain = %s"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(query, (domain,))
            site_id = cursor.fetchone()[0] if cursor.rowcount != 0 else None
            conn.commit()
            cursor.close()
            return site_id
        except Exception as error:
            print("Error in get_domain_id_locking: ", error)
            return 'err'
                
                
def get_domain_robots_locking(domain, conn):
    """Read the stored robots text and crawl delay of one site.

    Despite its name, ``domain`` is compared against the ``id`` column of
    crawldb.site, i.e. it is the site id.

    Both columns are read with a single query. Every failure path returns
    a 2-tuple, so callers that unpack the result
    (``robots, delay = get_domain_robots_locking(...)``) no longer hit a
    TypeError when the site row is missing.

    Args:
        domain: id of the crawldb.site row.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        ``(robots_content, delay)`` for the site, or ``('err', 'err')``
        when the row does not exist or any exception was raised.
    """
    query = "SELECT robots_content, delay FROM crawldb.site WHERE id = %s"
    with lock:
        try:
            conn.autocommit = True
            cur = conn.cursor()
            try:
                cur.execute(query, (domain,))
                row = cur.fetchone()
                conn.commit()
            finally:
                # Release the cursor even when the query fails.
                cur.close()
        except Exception as e:
            print("Error in get_domain_robots_locking: ", e)
            return 'err', 'err'

    if row is None:
        print("Error with cur in get_domain_robots_locking!")
        return 'err', 'err'
    return row[0], row[1]
                 
                    
def get_last_time_locking(siteid, conn):
    """Return how many seconds ago the site was last accessed.

    The value is computed by the database as
    ``CURRENT_TIMESTAMP - accessed_time`` converted to epoch seconds.

    Args:
        siteid: id of the crawldb.site row.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        The value of the first result row (``None`` when the database
        yields NULL), ``None`` after printing a message when no row
        matched, or the string ``'err'`` when any exception was raised.
    """
    query = "SELECT EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - S.accessed_time)) FROM crawldb.site S WHERE S.id = %s"
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(query, (siteid,))
            elapsed = cursor.fetchone()[0] if cursor.rowcount != 0 else -1
            conn.commit()
            cursor.close()
            if elapsed == -1:
                print("Error with cur in get_last_time_locking!")
                return None
            return elapsed
        except Exception as error:
            print("Error in get_last_time_locking: ", error)
            return 'err'

        
def update_last_time_locking(siteid, acc_time, conn):
    """Store ``acc_time`` as the ``accessed_time`` of one crawldb.site row.

    Args:
        siteid: id of the row to update.
        acc_time: new value for ``accessed_time``.
        conn: open database connection; ``autocommit`` is switched on.

    Returns:
        ``None`` on success, or the string ``'err'`` when any exception
        was raised.
    """
    update_sql = "UPDATE crawldb.site SET accessed_time = %s WHERE id = %s; "
    with lock:
        try:
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute(update_sql, (acc_time, siteid))
            conn.commit()
            cursor.close()
        except Exception as error:
            print("Error in update_last_time_locking: ", error)
            return 'err'
    return None


In [None]:
import pandas as pd
import os
import io
from urllib.parse import urlparse
# Literal robots.txt field name: used as a line prefix when parsing and as the
# key / column name under which user agents are stored.
ua = 'User-agent'


def get_robots_url(url):
    """Return the robots.txt URL for the scheme and host of ``url``."""
    parts = urlparse(url)
    return f'{parts.scheme}://{parts.netloc}' + '/robots.txt'
 
def read_robots_txt(url):
    """Download the robots.txt belonging to ``url`` with ``curl``.

    The URL comes from crawled pages, so it is passed to ``curl`` as a
    separate argv entry instead of being interpolated into a shell command
    line (the previous ``os.popen`` call allowed shell injection).

    Args:
        url: any URL on the target site.

    Returns:
        Whatever ``curl`` wrote to standard output (an empty string when
        ``curl`` is not installed). The exit status is not checked, so an
        error page body may be returned.
    """
    import subprocess  # local import: only this helper needs it

    robot_url = get_robots_url(url)
    try:
        completed = subprocess.run(
            ['curl', robot_url],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
    except FileNotFoundError:
        # No curl binary available: behave like an empty robots.txt.
        return ''
    return completed.stdout
 
def initialize_dict(url):
    """Download robots.txt and collect the user agents it names.

    Only lines that start with the ``ua`` prefix are considered; the agent
    name is the text between the first and second ``:`` of such a line.

    Args:
        url: any URL on the target site.

    Returns:
        A 3-tuple ``(result_data_set, keys, robot_file)`` where
        ``result_data_set`` is ``{ua: {agent: {}}}``, ``keys`` lists the
        agent names in order of first appearance and ``robot_file`` is the
        raw robots.txt text.
    """
    robot_file = read_robots_txt(url)
    agents = {
        line.split(':')[1].strip(): {}
        for line in robot_file.split("\n")
        if line.startswith(ua)
    }
    return {ua: agents}, list(agents), robot_file

#def make_sitemaps(robots):
#    data = []
#    lines = str(robots).splitlines()
#    for line in lines:
#       # print(line)
#        if line.startswith('Sitemap:'):
#            split = line.split(':', maxsplit=1)
#            data.append(split[1].strip())            

#    return data
def parse_robot(url):
    """Parse the robots.txt of ``url`` into per-agent Allow/Disallow lists.

    For every user agent found by ``initialize_dict`` the file is scanned
    once; lines between the agent's own line and the next agent's line are
    treated as that agent's section.

    Changes compared with the previous version:
      * rule values keep everything after the first ``:`` (a value such as
        ``/a:b`` or an absolute URL used to be cut at its first colon);
      * a fractional or malformed ``Crawl-delay`` no longer raises;
        fractions are rounded up, unparsable values are ignored.

    NOTE(review): section boundaries are still detected with substring
    tests (``agent in line``), so an agent token such as ``*`` also matches
    rule lines that contain ``*``. ``print_flag`` is also carried over from
    one agent's scan to the next. Both are left unchanged here.

    Args:
        url: any URL on the target site.

    Returns:
        ``(result_data_set, sitemaps, crawl_delay)``: the nested rule dict
        ``{ua: {agent: {'Disallow': [...], 'Allow': [...]}}}``, the list of
        ``Sitemap:`` values seen inside agent sections, and the last
        crawl delay seen as an int (default 5).
    """
    import math  # local import: only needed for rounding the crawl delay

    result_data_set, keys, robot_file = initialize_dict(url)
    sitemaps = []
    crawl_delay = 5
    print_flag = False
    lines = robot_file.split("\n")

    for i, agent in enumerate(keys):
        # The section of this agent ends where the next agent is mentioned.
        end_str = keys[i + 1] if i + 1 < len(keys) else 'We are done'

        rules = result_data_set[ua][agent]
        rules['Disallow'] = []
        rules['Allow'] = []

        for line in lines:
            if end_str in line:
                print_flag = False
            elif agent in line:
                print_flag = True
            elif print_flag:
                if line.startswith('Disallow') or line.startswith('Allow'):
                    status, _, val = line.partition(':')
                    status = status.strip()
                    # Skip look-alike directives instead of raising KeyError.
                    if status in rules:
                        rules[status].append(val.strip())
                if line.startswith('Crawl-delay:'):
                    raw_delay = line.split(':', maxsplit=1)[1].strip()
                    try:
                        crawl_delay = math.ceil(float(raw_delay))
                    except (ValueError, OverflowError):
                        # Unparsable delay: keep the value we already have.
                        pass
                if line.startswith('Sitemap:'):
                    sitemaps.append(line.split(':', maxsplit=1)[1].strip())
    return result_data_set, sitemaps, crawl_delay

def robots_to_String(url):
    """Render the robots.txt rules of ``url`` as a text table.

    Every rule becomes one row with the columns ``ua`` (agent name),
    ``Status`` (``Allow`` / ``Disallow``) and ``Pattern`` (the rule value
    passed through ``make_absolute`` with ``url`` as base).

    Args:
        url: any URL on the target site.

    Returns:
        ``(table_text, sitemaps, crawl_delay)`` where ``table_text`` is the
        pandas ``to_string`` rendering of the rule table and the other two
        values come unchanged from ``parse_robot``.
    """
    rules, sitemaps, crawl_delay = parse_robot(url)
    table = {ua: [], 'Status': [], 'Pattern': []}
    for agents in rules.values():
        for agent, directives in agents.items():
            for status, patterns in directives.items():
                for pattern in patterns:
                    table[ua].append(agent)
                    table['Status'].append(status)
                    table['Pattern'].append(make_absolute(url, pattern))
    robots_df = pd.DataFrame.from_dict(table)
    return robots_df.to_string(), sitemaps, crawl_delay

def getAllow_Dissalow(data):
    """Extract the Disallow and Allow patterns from a rendered rule table.

    ``data`` is the whitespace-separated text table produced by
    ``robots_to_String``. Only rows whose ``User-agent`` column equals
    ``"*"`` or the module constant ``ua`` are kept; rules for other agents
    do not concern this crawler.

    Empty input now yields two empty lists instead of raising
    ``pandas.errors.EmptyDataError``, and the separator is a raw-string
    regex (``'\\s+'`` in a normal string is an invalid escape sequence).

    Args:
        data: table text, or an empty string / ``None``.

    Returns:
        ``(disallowed, allowed)``: two lists of pattern strings.
    """
    try:
        df = pd.read_csv(io.StringIO(data), sep=r'\s+')
    except pd.errors.EmptyDataError:
        return [], []

    if "User-agent" not in df:
        return [], []

    df = df[(df["User-agent"] == "*") | (df["User-agent"] == ua)]
    allowed = df[df['Status'] == "Allow"].Pattern.tolist()
    dissaloved = df[df['Status'] == "Disallow"].Pattern.tolist()
    return dissaloved, allowed

# Example run executed when the cell runs: downloads and parses one robots.txt.
data,sitemaps,crawl_delay=robots_to_String("https://www.avvo.com/robots.txt") # call this to get the rule table (a DataFrame rendered as text)
#print(sitemaps)
#print(crawl_delay)
# Split the rendered table back into the Disallow / Allow pattern lists.
dissallowed,allowed=getAllow_Dissalow(data)
#print(dissallowed)
#print(allowed)


['https://fortune.com/wp-admin/', 'https://fortune.com/sponsored/', 'https://fortune.com/feeds/', 
 'https://fortune.com/feed/', 'https://fortune.com/wp-login.php', 'https://fortune.com/wp-signup.php', 
 'https://fortune.com/press-this.php', 'https://fortune.com/remote-login.php', 'https://fortune.com/activate/',
 'https://fortune.com/cgi-bin/', 'https://fortune.com/mshots/v1/', 'https://fortune.com/next/', 
 'https://fortune.com/sponsored/', 'https://fortune.com/feeds/', 'https://fortune.com/feed/', 'https://fortune.com/wp-login.php', 'https://fortune.com/wp-signup.php', 'https://fortune.com/press-this.php', 'https://fortune.com/remote-login.php', 'https://fortune.com/activate/', 'https://fortune.com/cgi-bin/', 'https://fortune.com/mshots/v1/', 'https://fortune.com/next/']
['https://fortune.com/wp-admin/admin-ajax.php']

V spodnjem delu kode so metode za pridobivanje sitemapov. Najprej je treba iz trenutne povezave dobiti naslov .xml datoteke: kliči get_sitemap(robots.txt), ki iz robots.txt pridobi naslov sitemap.xml; spodnje funkcije se nato sprehodijo po sitemapu.
1. poženi urlSitemap = get_sitemap(robots.txt)
2. get_all_urls_SiteMap(urlSitemap)

Moj naslov: WEB_DRIVER_LOCATION = "C:/Work/Magisterij_1_leto/2.semester/ekstrakcijaSplet/Nal1/chromedriver.exe"
Juretov naslov: C:/Users/Pirk/Desktop/faks-mag/ekstrakcija/chromedriver.exe

In [None]:
import pandas as pd
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import xmltodict

def get_sitemaps(url):
    """Download an XML sitemap and parse it.

    The HTTP response is now closed once parsing is done; previously it
    was left open.

    Args:
        url (string): Fully qualified URL pointing to an XML sitemap
            (for example one listed in robots.txt).

    Returns:
        BeautifulSoup: the parsed document (built with the ``lxml-xml``
        parser), not a plain string.
    """
    with urllib.request.urlopen(url) as response:
        # BeautifulSoup consumes the response inside the constructor, so
        # the connection can be released right after.
        xml = BeautifulSoup(response,
                            'lxml-xml',
                            from_encoding=response.info().get_param('charset'))
    return xml

def get_sitemap_type(xml):
    """Classify a parsed sitemap document.

    Args:
        xml: parsed sitemap; only its ``find_all`` method is used.

    Returns:
        ``'sitemapindex'`` when the document has a ``sitemapindex`` element
        (it links to child sitemaps), ``'urlset'`` when it has a ``urlset``
        element (it lists page URLs directly), otherwise ``None``.
    """
    has_index = bool(xml.find_all('sitemapindex'))
    has_urlset = bool(xml.find_all('urlset'))
    if has_index:
        return 'sitemapindex'
    if has_urlset:
        return 'urlset'
    return None
    
def get_child_sitemaps(xml):
    """List the child sitemap URLs of a sitemap index.

    Args:
        xml: parsed sitemap index; only its ``find_all`` method is used.

    Returns:
        list: for every ``sitemap`` element, the text of the next ``loc``
        element, in document order.
    """
    return [entry.findNext("loc").text for entry in xml.find_all("sitemap")]
def sitemap_to_dataframe(xml, name=None, data=None, verbose=False):
    """Read an XML sitemap into a Pandas dataframe.

    Fixes compared with the previous version:
      * ``if verbose:`` had no body, which made the whole cell fail to
        compile; it now prints each row;
      * rows are collected in a list and the frame is built once
        (``DataFrame.append`` no longer exists in pandas 2);
      * ``loc`` / ``changefreq`` / ``priority`` are looked up inside each
        ``url`` element, so a missing field is reported as ``''`` instead
        of picking up the value of a following entry.

    Args:
        xml: parsed sitemap; ``find_all("url")`` must return elements that
            offer ``find(tag)``.
        name (optional): Optional name for the sitemap parsed; stored in
            the ``sitemap_name`` column.
        data (optional): unused, kept for backward compatibility.
        verbose (boolean, optional): Set to True to print every row.

    Returns:
        dataframe: one row per ``url`` element with the columns ``loc``,
        ``changefreq``, ``priority``, ``domain`` and ``sitemap_name``.
    """
    columns = ['loc', 'changefreq', 'priority', 'domain', 'sitemap_name']
    sitemap_name = name if name else ''

    def child_text(url_tag, child_name):
        # Text of the named child element, or '' when it is absent.
        node = url_tag.find(child_name)
        return node.text if node is not None else ''

    rows = []
    for url in xml.find_all("url"):
        loc = child_text(url, "loc")
        row = {
            'domain': urlparse(loc).netloc if loc else '',
            'loc': loc,
            'changefreq': child_text(url, "changefreq"),
            'priority': child_text(url, "priority"),
            'sitemap_name': sitemap_name,
        }
        if verbose:
            print(row)
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)
def get_all_urls_SiteMap(url):
    """Return a dataframe containing all of the URLs from a site's XML sitemaps.

    When ``url`` is a sitemap index, every child sitemap is downloaded and
    parsed. When it is a plain sitemap, the document that was already
    downloaded is reused (it used to be fetched a second time).

    Args:
        url (string): URL of the site's XML sitemap, usually /sitemap.xml
            or a value taken from robots.txt.

    Returns:
        df (dataframe): all rows produced by ``sitemap_to_dataframe`` for
        the visited sitemaps; an empty frame with the standard columns
        when nothing was found.
    """
    columns = ['loc', 'changefreq', 'priority', 'domain', 'sitemap_name']

    xml = get_sitemaps(url)
    if get_sitemap_type(xml) == 'sitemapindex':
        # Index file: follow every child sitemap it links to.
        documents = ((child, get_sitemaps(child)) for child in get_child_sitemaps(xml))
    else:
        documents = [(url, xml)]

    frames = [sitemap_to_dataframe(document, name=source) for source, document in documents]
    # Empty frames are dropped before concatenating; pandas warns about them.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

# Example run: the sitemap URL would normally come from robots.txt.
url="https://www.gov.si/sitemap.xml"
dataFrame = get_all_urls_SiteMap(url)
#print(dataFrame.head())
#print(dataFrame.sitemap_name.value_counts())
# Parse the sitemap itself. This used to call get_sitemap(), which expects
# robots.txt text rather than a URL and is only defined in a later cell.
xml=get_sitemaps(url)
#sitemap_type = get_sitemap_type(xml)
#if(sitemap_type=="sitemapindex"): # if it is an index
    #child_sitemaps = get_child_sitemaps(xml)
    #print(child_sitemaps)


In [None]:
import requests
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import queue
import re
import requests
from urllib.parse import urljoin
from urllib.parse import urlparse
from PIL import Image
from url_normalize import url_normalize
from urllib.parse import urldefrag
import hashlib
import datetime
from threading import Thread
from time import sleep
import json
import schedule
#import magic


# URLs that crawl_site has already claimed (appended there under `lock`).
visited_urls = []
# sha224 hex digest of a page's HTML -> page id; written by crawl_site, read by checkduplicate.
visited = {}
# Saved to / loaded from DOMAIN_LOCATION; nothing else in the visible code fills it.
domains = {}
# Queue of URLs waiting to be crawled.
frontier = queue.Queue()
#WEB_DRIVER_LOCATION = "C:/Users/Pirk/Desktop/faks-mag/ekstrakcija/chromedriver.exe" #jure
#WEB_DRIVER_LOCATION = "C:/Work/Magisterij_1_leto/2.semester/ekstrakcijaSplet/Nal1/chromedriver.exe" #matjaž
# Machine-specific paths: chromedriver binary and the three state files.
WEB_DRIVER_LOCATION = "C:/Users/miham/Documents/Faks/IEPS/chromedriver.exe" #Miha
FRONTIER_LOCATION = "C:/Users/miham/Documents/GitHub/IESP-1/PA1/frontier.txt" #Miha
DOMAIN_LOCATION = "C:/Users/miham/Documents/GitHub/IESP-1/PA1/domains.txt" #Miha
VISITED_LOCATION = "C:/Users/miham/Documents/GitHub/IESP-1/PA1/visited.txt" #Miha
# Seconds get_robotstxt sleeps after loading a page; crawl_site assigns its own local of the same name.
TIMEOUT = 5
#url =  "https://www.gov.si/"
# NOTE(review): not referenced anywhere in the visible code (checkduplicate uses sha224).
sha256 = hashlib.sha256()

def is_absolute(link):
    """Return True when ``link`` carries a network location (host) part."""
    netloc = urlparse(link).netloc
    return True if netloc else False

def is_link(link):
    """Return True when ``link`` contains ``://`` with a character on each side.

    Intended as a validity check for links that go into the frontier. The
    pattern is now a raw string: the old literal relied on the invalid
    escape sequence ``\\/``, which newer Python versions warn about. The
    regular expression itself is unchanged.
    """
    return re.search(r'.:\/\/.', link) is not None
    
def make_absolute(baselink, link):
    """Turn ``link`` into an absolute URL.

    A link without a host part is resolved against ``baselink``; a
    protocol-relative link (``//host/...``) gets ``https:`` prepended; any
    other link is returned unchanged.
    """
    if not is_absolute(link):
        return urljoin(baselink, link)
    if link[:2] == '//':
        return 'https:' + link
    return link

def get_robotstxt(link):
    """Load ``link + '/robots.txt'`` in headless Chrome and return its text nodes.

    The browser is now always shut down with ``driver.quit()``; it used to
    be left running after every call.

    Args:
        link: site base URL without a trailing slash.

    Returns:
        The ``contents`` list of the first ``<pre>`` element of the
        rendered page.

    Raises:
        AttributeError: when the rendered page has no ``<pre>`` element.
    """
    chrome_options = Options()
    chrome_options.add_argument("user-agent=fri-ieps-TEST")
    chrome_options.add_argument('headless')
    driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
    try:
        driver.get(link + '/robots.txt')
        # Give the page time to finish loading before reading it.
        time.sleep(TIMEOUT)
        html = driver.page_source
    finally:
        driver.quit()
    bsObj = BeautifulSoup(html, 'html.parser')
    robots = bsObj.find('pre').contents
    return robots

def get_sitemap(robotstxt):
    """Return the first sitemap URL named in a robots.txt text.

    The text is split on whitespace and the token following the first
    ``Sitemap:`` token is returned. Compared with the previous version the
    field name is matched case-insensitively, ``Sitemap:<url>`` written
    without a space is understood, and a trailing ``Sitemap:`` with nothing
    after it yields ``None`` instead of raising IndexError.

    Args:
        robotstxt: content of a robots.txt file.

    Returns:
        The sitemap URL as a string, or ``None`` when there is none.
    """
    prefix = "sitemap:"
    tokens = robotstxt.split()
    for position, token in enumerate(tokens):
        lowered = token.lower()
        if lowered == prefix:
            if position + 1 < len(tokens):
                return tokens[position + 1]
            return None
        if lowered.startswith(prefix):
            return token[len(prefix):]
    return None

def getsitemapContext(url):
    """Query ``url`` for its ``sitemap`` elements and discard the result.

    NOTE(review): nothing is returned, so this looks unfinished.
    """
    url.find_all("sitemap")

def checkduplicate(html):
    """Look up already stored content by its hash.

    Args:
        html: page content; it is converted with ``str`` and hashed with
            SHA-224.

    Returns:
        The value stored in ``visited`` for that digest, or ``-1`` when
        the digest is unknown.
    """
    digest = hashlib.sha224(str(html).encode("utf-8")).hexdigest()
    return visited.get(digest, -1)
    
def get_ending(url):
    """Classify a URL by its document extension.

    The extension is looked for first in the path of the URL (so a query
    string or fragment does not hide it) and then, as before, after the
    last dot of the whole URL. Matching is case-insensitive.

    The previous version could never return ``'drop'`` because
    ``str.split`` always yields at least one item; an empty URL is now the
    case that returns it.

    Args:
        url: URL string.

    Returns:
        One of ``'pdf'``, ``'doc'``, ``'docx'``, ``'ppt'``, ``'pptx'`` when
        the URL points to such a document, ``'drop'`` for an empty URL,
        otherwise ``'html'``.
    """
    if not url:
        return 'drop'

    stays_same = ('pdf', 'doc', 'docx', 'ppt', 'pptx')
    for candidate in (urlparse(url).path, url):
        extension = candidate.rsplit('.', 1)[-1].lower()
        if extension in stays_same:
            return extension
    return 'html'

    
def isallowed(url, dissaloved):
    """Decide whether ``url`` may be added to the frontier.

    A URL is rejected when it is ``None``, does not contain ``gov.si``,
    contains ``mailto``, contains ``tel:`` followed by a digit, or is equal
    to one of the disallowed entries.

    Behaviour is unchanged; the regex literal is now a raw string (``\\d``
    in a normal string is an invalid escape sequence) and ``None`` is
    tested with ``is``.

    NOTE(review): disallowed entries are compared for exact equality, not
    as path prefixes, so ``/private/`` does not block ``/private/page``.

    Args:
        url: candidate URL or ``None``.
        dissaloved: collection of disallowed URL strings.

    Returns:
        bool: True when the URL passes every check.
    """
    if url is None:
        return False
    if 'gov.si' not in url:
        return False
    if 'mailto' in url:
        return False
    if re.search(r'^.*tel:\d{1,9}.*$', url) is not None:
        return False
    if url in dissaloved:
        return False
    return True
    
def _render_page_source(url, wait_seconds):
    """Load ``url`` in headless Chrome and return the rendered HTML.

    Returns ``None`` when the page could not be loaded. The browser is
    always shut down before returning.
    """
    chrome_options = Options()
    chrome_options.add_argument("user-agent=fri-ieps-man_shrugging")
    chrome_options.add_argument('headless')
    driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
    try:
        try:
            driver.get(url)
        except Exception as e:
            print("Error in crawl_site while loading page: ", e)
            return None
        # Give scripts on the page time to finish before reading the DOM.
        time.sleep(wait_seconds)
        return driver.page_source
    finally:
        driver.quit()


def crawl_site(url, siteid, dissaloved, allowed, delay, conn):
    """Fetch one page, store it and feed its links into the frontier.

    Steps: claim the URL, skip non-HTML resources (HEAD request), wait
    until the site's crawl delay has passed, record the HTTP status,
    render the page in headless Chrome, detect duplicate content by hash,
    store the page and its images, then classify every link as a new
    frontier entry or as a document record.

    Fixes compared with the previous version:
      * the Chrome process is shut down after every page (it was leaked);
      * the "already visited" test and the append happen under one lock;
      * a missing content-type header, an ``<img>`` without ``src`` and a
        page without ``<head>`` no longer raise;
      * relative links are resolved against the page URL rather than the
        bare host, so ``page2.html`` keeps its directory;
      * an ``'err'`` result from the last-access lookup, or an ``'err'``
        delay, is treated like "unknown" instead of raising;
      * the page-load failure message no longer names ``update_link_locking``.

    Args:
        url: page URL taken from the frontier.
        siteid: id of the crawldb.site row the page belongs to.
        dissaloved: disallowed URLs, passed on to ``isallowed``.
        allowed: allowed URLs; not used by this function.
        delay: crawl delay in seconds; ``None``, ``-1`` and ``'err'``
            select the default of 5.
        conn: open database connection shared by the worker threads.

    Returns:
        None.
    """
    # Local wait time; the module-level TIMEOUT is deliberately left alone.
    timeout = 5 if delay in (None, -1, 'err') else delay

    # Claim the URL atomically so two threads cannot crawl the same page.
    with lock:
        if url in visited_urls:
            return
        visited_urls.append(url)

    pagetype = requests.head(url).headers.get('content-type', '')
    if 'html' not in pagetype:
        return

    time_diff = get_last_time_locking(siteid, conn)
    print('\n')
    print(time_diff, 'AAAAAAAAAAAAAAAAAA')
    if time_diff is None or time_diff == 'err':
        time.sleep(5)
        print("ni nastavljen last time")
    elif int(time_diff) < int(timeout):
        print(int(timeout) - int(time_diff), "spim toliko casa")
        time.sleep(int(timeout) - int(time_diff))

    acc_time = datetime.datetime.fromtimestamp(time.time())

    response = requests.get(url)
    statusCode = response.status_code
    update_last_time_locking(siteid, acc_time, conn)

    normalized_url = urldefrag(url_normalize(url))[0]

    if statusCode >= 300:
        # Redirects and errors are recorded without content.
        update_page_locking(siteid, normalized_url, '', statusCode, acc_time, 'HTML', conn)
        return

    html = _render_page_source(url, timeout)
    if html is None:
        return

    isVisited = checkduplicate(html)
    if isVisited != -1:
        # Same content as an earlier page: store a stub that links to it.
        pageid = update_page_locking(siteid, normalized_url, '', statusCode, acc_time, 'DUPLICATE', conn)
        if pageid == 'err' or pageid is None:
            return
        update_link_locking(isVisited, pageid, conn)
        return

    bsObj = BeautifulSoup(html, 'html.parser')
    links = bsObj.find_all('a', href=True)
    img_tags = bsObj.findAll('img')

    page_type_code = "BINARY" if bsObj.find('head') is None else "HTML"

    pageid = update_page_locking(siteid, normalized_url, str(html), str(statusCode), acc_time, page_type_code, conn)
    if pageid == 'err' or pageid is None:
        return

    with lock:
        visited[hashlib.sha224(str(html).encode("utf-8")).hexdigest()] = pageid

    img_db_objs = []
    for img in img_tags:
        img_url = img.get('src')
        if not img_url:
            continue
        name = img_url.split('/')[-1]
        img_db_objs.append({
            # The last four characters are taken as the file type.
            'filename': name[:-4],
            'content_type': name[-4:],
            'data': img_url,
            'accessed_time': acc_time,
        })
    update_image_locking(img_db_objs, pageid, conn)

    for link in links:
        href = link.attrs['href']
        if len(href) > 0 and href[0] != '#':
            absolute_link = make_absolute(url, href)
            ending = get_ending(absolute_link)
            if ending == 'html':
                if isallowed(absolute_link, dissaloved):
                    # queue.Queue does its own locking.
                    frontier.put(absolute_link)
            elif ending == 'drop':
                continue
            else:
                # Documents (pdf, doc, ...) are recorded, not crawled.
                update_page_data_locking(pageid, ending.upper(), '', conn)
       
        
def pajek(url, conn):
    """Worker entry point: prepare the site record for ``url`` and crawl it.

    Fixes compared with the previous version:
      * the ``url`` argument is used. It used to be overwritten with
        another ``frontier.get()``, which dropped the URL handed over by
        ``run`` and blocked forever once the queue was empty;
      * whether a site is already known is decided by a database lookup
        instead of ``domain in visited_urls`` (that list holds page URLs);
      * ``'err'`` / missing results from the database helpers stop the
        worker instead of being passed on as a site id.

    Args:
        url: page URL to crawl.
        conn: open database connection shared by the worker threads.

    Returns:
        None.
    """
    match = re.match(r'^.+?[^\/:](?=[?\/]|$)', url)
    if match is None:
        print("Error in pajek: cannot determine domain of ", url)
        return
    domain = match.group(0)

    siteid = get_domain_id_locking(str(domain), conn)
    if siteid == 'err':
        return

    if siteid is not None:
        # Known site: reuse the robots rules stored with it.
        stored = get_domain_robots_locking(siteid, conn)
        if stored is None or stored[0] == 'err':
            return
        robotsstr, delay = stored
        print(siteid, "id v bazo")
    else:
        # New site: fetch robots.txt and create the site row.
        robotsstr, sitemap, delay = robots_to_String(url)
        siteid = update_site_locking(domain, robotsstr, str(sitemap), delay, conn)
        if siteid is None or siteid == 'err':
            return

    if robotsstr:
        dissaloved, allowed = getAllow_Dissalow(robotsstr)
    else:
        dissaloved, allowed = [], []

    crawl_site(url, siteid, dissaloved, allowed, delay, conn)
    return



def run(max_threads, conn):
    """Crawl up to ``max_threads`` frontier URLs in parallel and wait for them.

    One URL is taken from the frontier per thread; every thread runs
    ``pajek``. The function returns once all threads have finished.

    The connection check now treats every nonzero ``conn.closed`` value as
    "not usable" (psycopg2 also reports broken connections that way); it
    used to test for the value 1 only.

    NOTE(review): the replacement connection targets a different server
    and database than the one opened at start-up, uses hard-coded
    credentials, and is not handed back to the caller.

    Args:
        max_threads: number of URLs / threads to start.
        conn: open database connection shared by the worker threads.
    """
    print("IN RUN")
    if conn.closed:
        conn = psycopg2.connect(host="localhost",dbname='postgres', user="postgres", password="admin")

    threads = []
    for i in range(max_threads):
        url = frontier.get()
        print(url, "from thread", i)
        worker = Thread(target=pajek, args=(url, conn))
        threads.append(worker)
        worker.start()

    for worker in threads:
        worker.join()
    
def init(conn):
    """Main loop: run scheduled jobs and crawl the frontier in batches.

    Each round starts at most 20 worker threads (fewer when the frontier
    holds fewer URLs). The loop never ends on its own.

    Fixes compared with the previous version:
      * ``except e:`` referenced an undefined name and raised NameError as
        soon as ``run`` failed; the exception is now caught and printed;
      * when the frontier is empty the loop pauses for a second instead of
        spinning at full speed.

    Args:
        conn: open database connection handed on to ``run``.
    """
    print("IN INIT")
    while True:
        max_threads = 20
        schedule.run_pending()  # runs save_data when it is due
        if frontier.qsize() < max_threads:
            max_threads = frontier.qsize()
            print("MAX THREADS: ", max_threads)
            if max_threads == 0:
                print('PRAZNA VRSTA')
                # Nothing to do right now; avoid a busy loop.
                time.sleep(1)
        try:
            run(max_threads, conn)
        except Exception as e:
            print("ERROR IN INIT: ", e)
    

def save_data():
    """Write the crawler state to disk.

    The frontier is written one URL per line to FRONTIER_LOCATION;
    ``domains`` and ``visited`` are written as JSON to DOMAIN_LOCATION and
    VISITED_LOCATION. Existing files are overwritten.
    """
    print("SAVING...")
    pending_urls = list(frontier.queue)
    with open(FRONTIER_LOCATION, 'w') as frontier_file:
        frontier_file.writelines('%s\n' % pending for pending in pending_urls)

    with open(DOMAIN_LOCATION, 'w') as domain_file:
        domain_file.write(json.dumps(domains))

    with open(VISITED_LOCATION, 'w') as visited_file:
        visited_file.write(json.dumps(visited))

# Start-up script: register the periodic save, restore state from disk,
# connect to the database and enter the crawl loop.
schedule.every().hour.do(save_data) #schedules to run save_data every hour

#check for frontier.txt, load it into frontier queue
with open(FRONTIER_LOCATION, 'r') as f:
    with frontier.mutex:
        frontier.queue.clear()  # in case anything was left in the frontier, just to be safe
        urls = f.readlines()
    for url in urls:
        frontier.put(url.strip()) # add the saved URLs again
    f.close()

#loads domains and visited dictionaries
with open(DOMAIN_LOCATION, 'r') as d:
    data = d.read()
    domains = json.loads(data)
    d.close()

with open(VISITED_LOCATION, 'r') as v:
    data = v.read()
    visited = json.loads(data)
    v.close()

frontier_list = list(frontier.queue)

print("FRONTIER: ", frontier_list)
print("DOMAINS: ", domains)
print("VISITED: ", visited)

# NOTE(review): host and credentials are hard-coded here.
conn = psycopg2.connect(host="83.212.127.54",dbname='crawler', user="test", password="fricrawl")
#reset_db(conn)
#print('reset')
# Does not return: init() loops forever.
init(conn)