# Crawling

In [1]:
from crawler import download

from bs4 import BeautifulSoup
import requests
import time

In [2]:
def blogURL(url):
    return requests.compat.urlparse(url[1].endswith("tistory.com"))

In [3]:
def parseURL(seed):
    html = download("get", seed[0])
    dom = BeautifulSoup(html.text, "lxml")

    return [(requests.compat.urljoin(seed[0], tag["href"]), seed[1] + 1)
            for tag in dom.find_all("a")
            if seed[1] < 2 and tag.has_attr("href")
            and 'javascript' not in tag['href'] and len(tag["href"]) > 3
            # and blogURL(requests.compat.urljoin(seed[0], tag["href"]))
            ]

In [4]:
url = "https://search.daum.net/search"
html = download("get", url, params={'q': '박보영'})
dom = BeautifulSoup(html.text, "lxml")

## BFS Search

In [5]:
queue = list()
queue.extend([(_['href'], 1) for _ in dom.select("#blogColl a.f_link_b")
              if _.has_attr('href') and blogURL(_["href"])])

seen = list()

while queue:
    baseURL = queue.pop(0)  # 여기랑 밑에 줄 사이에 sleep을 줘야 할 것
    seen.append(baseURL)

    time.sleep(.25)
    linkList = parseURL(baseURL)
    for link in linkList:
        if link not in queue and list not in seen:
            queue.append(link)
#    queue.extend(linkList)
    print(f"Queue: {len(queue)}, Seed: {len(seen)}")

Queue: 47, Seed: 1
Queue: 64, Seed: 2
Queue: 88, Seed: 3
Queue: 95, Seed: 4
Queue: 94, Seed: 5
Queue: 93, Seed: 6
Queue: 92, Seed: 7
Queue: 91, Seed: 8
Queue: 90, Seed: 9
Queue: 89, Seed: 10
Queue: 88, Seed: 11
Queue: 87, Seed: 12
Queue: 86, Seed: 13
Queue: 85, Seed: 14
Queue: 84, Seed: 15
Queue: 83, Seed: 16
Queue: 82, Seed: 17
Queue: 81, Seed: 18
Queue: 80, Seed: 19
Queue: 79, Seed: 20
Queue: 78, Seed: 21
Queue: 77, Seed: 22
Queue: 76, Seed: 23
Queue: 75, Seed: 24
Queue: 74, Seed: 25
Queue: 73, Seed: 26
Queue: 72, Seed: 27
Queue: 71, Seed: 28
Queue: 70, Seed: 29
Queue: 69, Seed: 30
Queue: 68, Seed: 31
Queue: 67, Seed: 32
Queue: 66, Seed: 33
Queue: 65, Seed: 34
Queue: 64, Seed: 35
Queue: 63, Seed: 36
Queue: 62, Seed: 37
Queue: 61, Seed: 38
Queue: 60, Seed: 39
Queue: 59, Seed: 40
Queue: 58, Seed: 41
Queue: 57, Seed: 42
Queue: 56, Seed: 43
Queue: 55, Seed: 44
Queue: 54, Seed: 45
Queue: 53, Seed: 46
Queue: 52, Seed: 47
Queue: 51, Seed: 48
Queue: 50, Seed: 49
Queue: 49, Seed: 50
Queue: 48

In [6]:
def parseURL(seed):
    html = download("get", seed)
    dom = BeautifulSoup(html.text, "lxml")
    return [
        requests.compat.urljoin(html.request.url, tag["href"])
        for tag in dom.find_all("a")
        if tag.has_attr("href") and len(tag["href"]) > 3
    ]

## Focused Crawling with Database Tables
- Table 1: id(PK), path, param, seen, depth, inbound, date
- Table 2: id(PK), netloc(Net Location), date
- url: netloc + path + param

In [7]:
import sqlite3
conn = sqlite3.connect('bot.db')
cur = conn.cursor()

```sql
    CREATE TABLE table1(
        id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
        table2_id INTEGER NOT NULL,
        path TEXT NOT NULL,
        param TEXT NOT NULL,
        seen BOOLEAN DEFAULT FALSE,
        date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL
    );
    
    CREATE TABLE table2(
        id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
        netloc TEXT NOT NULL,
        date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL
    );
```

In [8]:
cur.executescript("""
    DROP TABLE IF EXISTS table1;
    CREATE TABLE table1(
        id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
        table2_id INTEGER NOT NULL,
        path TEXT NOT NULL,
        param TEXT NULL,
        depth INTEGER NOT NULL,
        seen BOOLEAN DEFAULT FALSE,
        inbound INTEGER NULL,
        date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL
    );

    DROP TABLE IF EXISTS table2;
    CREATE TABLE table2(
        id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
        netloc TEXT NOT NULL,
        date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL
    );
""")
conn.commit()

### Seed URLs --> DB Insert

In [9]:
url = 'https://www.google.com/search'
html = download('get', url, params={'q': '아이린'})
dom = BeautifulSoup(html.text, 'lxml')

urlList = [_.find_parent()['href'] for _ in dom.select(".LC20lb")]
for href in urlList:
    parse = requests.compat.urlparse(href)
    netloc, path, param, depth, inbound = '://'.join(
        parse[:2]), parse[2], parse[4], 1, 0
    #netloc, path, param, depth = parse.netloc, parse.path, parse.params, 0

    # DB Select Limit 0, 1
    cur.execute('SELECT id FROM table2 WHERE netloc=? LIMIT 0, 1;', (netloc, ))
    netlocId = cur.fetchone()
    if not netlocId:
        cur.execute('INSERT INTO table2(netloc) VALUES(?);', (netloc, ))
        conn.commit()
        cur.execute('SELECT id FROM table2 WHERE netloc=?;', (netloc, ))
        netlocId = cur.fetchone()
    cur.execute(
        'INSERT INTO table1(table2_id, path, param, depth, inbound) VALUES(?, ?, ?, ?, ?);',
        (netlocId[0], path, param, depth, inbound))
    conn.commit()
    # print(cur.lastrowid, result)

---

In [10]:
i = 0
while True:
    cur.execute("""
        SELECT table1.id, table2.netloc,
               table1.path, table1.param,
               table1.depth, table2.id
        FROM table1 
        JOIN table2 
            ON table1.table2_id = table2.id 
        WHERE table1.seen = FALSE and table1.depth < 3
        ORDER BY table1.date ASC 
        LIMIT 0, 1;
    """)
    seed = cur.fetchone()
    if not seed or i > 1000:
        break
    i += 1
    cur.execute(
        """
        UPDATE table1
        SET seen = TRUE
        WHERE id = ?;
    """, (seed[0], ))
    conn.commit()

    baseURL = f'{seed[1]+seed[2]}?'
    if seed[3]:
        baseURL = baseURL + f'{seed[3]}'

    print(baseURL)
    for href in parseURL(baseURL):
        parse = requests.compat.urlparse(href)
        netloc, path, param = '://'.join(parse[:2]), parse[2], parse[4]
        # netloc, path, param = parse.netloc, parse.path, parse.params

        # DB Select Limit 0, 1
        cur.execute('SELECT id FROM table2 WHERE netloc=? LIMIT 0, 1;',
                    (netloc, ))
        netlocId = cur.fetchone()
        if not netlocId:
            cur.execute('INSERT INTO table2(netloc) VALUES(?);', (netloc, ))
            conn.commit()
            cur.execute('SELECT id FROM table2 WHERE netloc=?;', (netloc, ))
            netlocId = cur.fetchone()
        cur.execute(
            'INSERT INTO table1(table2_id, path, param, depth, inbound) VALUES(?, ?, ?, ?, ?);',
            (netlocId[0], path, param, seed[4] + 1, seed[5]))
        conn.commit()
        # print(cur.lastrowid, result)

https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
https://ko.wikipedia.org/wiki/%EC%95%84%EC%9D%B4%EB%A6%B0_(1991%EB%85%84)?
https://maeari33.tistory.com/5?
https://twitter.com/hashtag/%EC%95%84%EC%9D%B4%EB%A6%B0?
https://twitter.com/search/%23%EC%95%84%EC%9D%B4%EB%A6%B0?
https://ccnp.tistory.com/297?
https://www.pinterest.co.kr/dreamtree001/%EC%95%84%EC%9D%B4%EB%A6%B0/?
http://m.blog.naver.com/kyung3376in/220835785923?
https://gall.dcinside.com/board/lists?id=irene
https://namu.wiki/w/%EB%82%98%EB%AC%B4%EC%9C%84%ED%82%A4:%EB%8C%80%EB%AC%B8?
https://namu.wiki/RecentChanges?
https://namu.wiki/RecentDiscuss?
https://board.namu.wiki/?
https://namu.wiki/NeededPages?
https://namu.wiki/OrphanedPages?
https://namu.wiki/UncategorizedPages?
https://namu.wiki/OldPages?
https://namu.wiki/ShortestPages?
https://namu.wiki/LongestPages?
https://namu.wiki/BlockHistory?
https://namu.wiki/RandomPage?
https://namu.wiki/Upload?
https://namu.wiki/License?
https://namu.

https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
http://gall.dcinside.com/board/lists/?id=irene
https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?
https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)?


KeyboardInterrupt: 

In [11]:
cur.close()
conn.close()

## Page rank - inbound

```sql
SELECT inbound, COUNT(inbound)
FROM table1
WHERE table2_id = 2
GROUP BY inbound;
```

In [12]:
conn = sqlite3.connect('bot.db')
cur = conn.cursor()

In [17]:
cur.execute("""
    SELECT inbound, COUNT(inbound)
    FROM table1
    WHERE table2_id = 5
    GROUP BY inbound;
""")
cur.fetchall()

[(0, 1), (5, 54)]

In [15]:
cur.execute("""
    SELECT COUNT(inbound)
    FROM table1
    WHERE inbound = 1;
""")
cur.fetchall()

[(55999,)]