# 爬取Mobile 01的機車遊記頁面

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

## 找到總共有幾頁

In [None]:
# Fetch the first listing page and read the total page count from its pager.
with urlopen("https://www.mobile01.com/topiclist.php?f=416&p=1") as html:
    # Parse inside the `with` so the HTTP response is closed as soon as the
    # document has been read, instead of staying open until garbage collection.
    soup = BeautifulSoup(html,"html")
# The second "pagination" block is used; the text of its last link is parsed
# as the page count (presumably the highest page number — verify on the site).
pagination_links = soup.find_all('div',{"class":"pagination"})[1].find_all('a')
pages = int(pagination_links[-1].text)

## 找到要的資料

In [None]:
# Column accumulators: one independent list per scraped field, filled by the
# scraping loop.
topic, Popularity, reply_amount, author = [], [], [], []
issuing_time, reply_name, reply_time, link = [], [], [], []

In [None]:
# Walk every listing page (1..pages) and append one entry per thread row to
# each of the column lists.
for page in range(1, pages + 1):
    page_url = "https://www.mobile01.com/topiclist.php?f=416&p=" + str(page)
    with urlopen(page_url) as html:
        # Parse inside the `with` so every HTTP response is closed promptly
        # rather than being left open for the garbage collector.
        soup = BeautifulSoup(html,"html")
    data = soup.find('div',{"class":"tablelist forumlist"})
    data = data.find_all('tr')
    # The first <tr> is skipped — presumably the table header row.
    for point in data[1:]:
        # Look these up once per row instead of once per field.
        first_link = point.find('a')
        paragraphs = point.find_all('p')
        # Keep only the part of the subject in front of the first '»'.
        topic.append(point.find('span',{"class":"subject-text"}).text.split('»')[0])
        # Second space-separated token of the first link's title attribute.
        Popularity.append(first_link.get('title').split(' ')[1])
        reply_amount.append(point.find('td',{"class":"reply"}).text)
        # The row's <p> elements are read as: 0 = posting time, 1 = author,
        # 2 = latest reply time, 3 = latest replier.
        author.append(paragraphs[1].text)
        issuing_time.append(paragraphs[0].text)
        reply_name.append(paragraphs[3].text)
        reply_time.append(paragraphs[2].text)
        link.append('https://www.mobile01.com/'+first_link.get('href'))

## 存成CSV

In [None]:
# Assemble the scraped columns into one DataFrame; the dict's insertion order
# doubles as the explicit column order.
point_columns = {
    "主題": topic,
    "人氣": Popularity,
    "作者": author,
    "發文時間": issuing_time,
    "最新回應者": reply_name,
    "最新回應時間": reply_time,
    "回應人數": reply_amount,
    "連結": link,
}
point_df = pd.DataFrame(point_columns, columns=list(point_columns))

In [None]:
# Save the table as a UTF-8 CSV file, leaving out the DataFrame index column.
point_df.to_csv(
    "travel.csv",
    encoding="utf-8",
    index=False,
)

# 機車遊記子版的資料似乎太少了，試試把整個機車版都爬下來

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import threading
import time
import queue
import csv

## 找到所有子版的網址

In [None]:
# Collect the absolute URL of every sub-board linked from the forum index page.
with urlopen("https://www.mobile01.com/forumlist.php?f=29") as html:
    # Parse inside the `with` so the HTTP response is closed once it is read.
    soup = BeautifulSoup(html,"lxml")
forum_cells = soup.find('div',{"class":"tablelist"}).find_all('td',{"class":"forumname"})
# Each href is appended to the site root unchanged — assumes the hrefs are
# relative paths.
board_link = ['https://www.mobile01.com/' + row.find('a').get('href')
              for row in forum_cells]

## 找到總共有幾頁

In [None]:
def get_pages(link: str) -> int:
    """Return the number of listing pages of the board at ``link``.

    The count is read from the text of the last link inside the page's
    second "pagination" block.

    Args:
        link: URL of a board listing page.

    Returns:
        The page count as an int.

    Raises:
        IndexError: If the page has fewer than two pagination blocks, or the
            second one contains no links.
        ValueError: If ``link`` is not a valid URL, or the last pagination
            link's text is not an integer.
    """
    # Parse inside the `with` so the HTTP response is closed once it is read.
    with urlopen(link) as html:
        soup = BeautifulSoup(html,"lxml")
    pagination = soup.find_all('div',{"class":"pagination"})
    page_links = pagination[1].find_all('a')
    return int(page_links[-1].text)

## 把所有要爬的網頁網址建成Queue

In [None]:
# Queue one URL per listing page of every board for the worker threads.
link_queue = queue.Queue()
for board_url in board_link:
    page_total = get_pages(board_url)
    for page_index in range(page_total):
        # Page numbers in the URL are 1-based.
        link_queue.put(board_url + '&p=' + str(page_index + 1))

## 找到要的資料

In [None]:
def get_data(page_link):
    """Scrape one board listing page into parallel per-column lists.

    Args:
        page_link: URL of the listing page to fetch.

    Returns:
        A tuple of eight equally long lists, in this order: topic,
        Popularity, reply_amount, author, issuing_time, reply_name,
        reply_time, link. Each list holds one entry per thread row.

    Raises:
        Errors from ``urlopen`` and from missing markup (``AttributeError``,
        ``IndexError``) are not caught here and propagate to the caller.
    """
    topic = []
    Popularity = []
    reply_amount = []
    author = []
    issuing_time = []
    reply_name = []
    reply_time = []
    link = []

    # Parse inside the `with` so the HTTP response is closed once it is read.
    with urlopen(page_link) as html:
        soup = BeautifulSoup(html,"lxml")
    data = soup.find('div',{"class":"tablelist forumlist"})
    data = data.find_all('tr')
    # The first <tr> is skipped — presumably the table header row.
    for point in data[1:]:
        # Look these up once per row instead of once per field.
        first_link = point.find('a')
        paragraphs = point.find_all('p')
        # Keep only the part of the subject in front of the first '»'.
        topic.append(point.find('span',{"class":"subject-text"}).text.split('»')[0])
        # Second space-separated token of the first link's title attribute.
        Popularity.append(first_link.get('title').split(' ')[1])
        reply_amount.append(point.find('td',{"class":"reply"}).text)
        # The row's <p> elements are read as: 0 = posting time, 1 = author,
        # 2 = latest reply time, 3 = latest replier.
        author.append(paragraphs[1].text)
        issuing_time.append(paragraphs[0].text)
        reply_name.append(paragraphs[3].text)
        reply_time.append(paragraphs[2].text)
        link.append('https://www.mobile01.com/'+first_link.get('href'))

    return topic,Popularity,reply_amount,author,issuing_time,reply_name,reply_time,link

## 寫入CSV

In [None]:
def write_csv(topic,Popularity,reply_amount,author,issuing_time,reply_name,reply_time,link):
    """Append one CSV row per scraped thread to ``bike.csv``.

    The eight arguments are parallel lists; row *i* of the output is built
    from element *i* of each list, in the same order as the parameters.
    Rows are written up to the length of the shortest list.

    Args:
        topic: Thread titles.
        Popularity: Popularity values.
        reply_amount: Reply counts.
        author: Thread authors.
        issuing_time: Posting times.
        reply_name: Names of the latest repliers.
        reply_time: Times of the latest replies.
        link: Thread URLs.
    """
    # The encoding is explicit so the non-ASCII text is written the same way
    # on every platform instead of depending on the locale's default codec.
    with open('bike.csv', 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(zip(topic, Popularity, reply_amount, author,
                             issuing_time, reply_name, reply_time, link))

## Thread建成class

In [None]:
class Worker(threading.Thread):
    """Thread that drains a shared queue of page URLs into the CSV file.

    Each worker repeatedly takes one URL from the queue, scrapes it with
    ``get_data`` and appends the rows with ``write_csv`` while holding the
    shared lock.

    Args:
        queue: Queue of listing-page URLs shared by all workers.
        lock: Lock shared by all workers; held around every CSV write.
    """

    def __init__(self, queue,lock):
        super().__init__()
        self.queue = queue
        self.lock = lock

    def run(self):
        """Process URLs until the queue is empty, then return."""
        # Imported locally because the constructor parameter is also called
        # `queue`; this keeps the exception lookup unambiguous.
        from queue import Empty

        while True:
            # A non-blocking get avoids the race in "check qsize, then get":
            # if another worker takes the last item in between, a blocking
            # get() would wait forever.
            try:
                page_link = self.queue.get_nowait()
            except Empty:
                return
            rows = get_data(page_link)
            # Use the lock handed to this worker (not a module-level name),
            # and release it even if the write raises.
            with self.lock:
                write_csv(*rows)

## 生成Thread

In [None]:
# A single lock is shared by all workers; 1000 worker threads are created
# (not yet started) on the same URL queue.
lock = threading.Lock()
workers = [Worker(link_queue, lock) for _ in range(1000)]

## 使用多執行緒爬取

In [None]:
# Start the threads one second apart ...
for thread in workers:
    thread.start()
    time.sleep(1)

# ... then block until every one of them has finished.
for thread in workers:
    thread.join()

# Python使用MultiThread似乎效率不彰,來試試MultiProcess吧

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import multiprocessing as mp
import queue
import time

## 找到所有子版的網址

In [2]:
# Collect the absolute URL of every sub-board linked from the forum index page.
with urlopen("https://www.mobile01.com/forumlist.php?f=29") as html:
    # Parse inside the `with` so the HTTP response is closed once it is read.
    soup = BeautifulSoup(html,"lxml")
forum_cells = soup.find('div',{"class":"tablelist"}).find_all('td',{"class":"forumname"})
# Each href is appended to the site root unchanged — assumes the hrefs are
# relative paths.
board_link = ['https://www.mobile01.com/' + row.find('a').get('href')
              for row in forum_cells]

## 找到總共有幾頁

In [3]:
def get_pages(link: str) -> int:
    """Return the number of listing pages of the board at ``link``.

    The count is read from the text of the last link inside the page's
    second "pagination" block.

    Args:
        link: URL of a board listing page.

    Returns:
        The page count as an int.

    Raises:
        IndexError: If the page has fewer than two pagination blocks, or the
            second one contains no links.
        ValueError: If ``link`` is not a valid URL, or the last pagination
            link's text is not an integer.
    """
    # Parse inside the `with` so the HTTP response is closed once it is read.
    with urlopen(link) as html:
        soup = BeautifulSoup(html,"lxml")
    pagination = soup.find_all('div',{"class":"pagination"})
    page_links = pagination[1].find_all('a')
    return int(page_links[-1].text)

## 把所有要爬的網頁網址建成List

In [4]:
# Expand every board into the URLs of all of its listing pages (p=1..N).
link_list = [
    board + '&p=' + str(page + 1)
    for board in board_link
    for page in range(get_pages(board))
]

## 找到要的資料

In [5]:
def get_data(page_link):
    """Scrape one board listing page into a DataFrame (one row per thread).

    Args:
        page_link: URL of the listing page to fetch.

    Returns:
        A DataFrame with the columns 主題, 人氣, 作者, 發文時間, 最新回應者,
        最新回應時間, 回應人數, 連結. If the page cannot be fetched or parsed,
        the failure is printed and an empty DataFrame with the same columns
        is returned, so callers can write every result without having to
        check for None.
    """
    columns = ["主題", "人氣", "作者","發文時間",
               "最新回應者","最新回應時間","回應人數",
               "連結"]
    topic = []
    Popularity = []
    reply_amount = []
    author = []
    issuing_time = []
    reply_name = []
    reply_time = []
    link = []
    try:
        # Parse inside the `with` so the HTTP response is closed once read.
        with urlopen(page_link) as html:
            soup = BeautifulSoup(html,"lxml")
        data = soup.find('div',{"class":"tablelist forumlist"})
        data = data.find_all('tr')
        # The first <tr> is skipped — presumably the table header row.
        for point in data[1:]:
            # Look these up once per row instead of once per field.
            first_link = point.find('a')
            paragraphs = point.find_all('p')
            # Keep only the part of the subject in front of the first '»'.
            topic.append(point.find('span',{"class":"subject-text"}).text.split('»')[0])
            # Second space-separated token of the first link's title.
            Popularity.append(first_link.get('title').split(' ')[1])
            reply_amount.append(point.find('td',{"class":"reply"}).text)
            # The row's <p> elements are read as: 0 = posting time,
            # 1 = author, 2 = latest reply time, 3 = latest replier.
            author.append(paragraphs[1].text)
            issuing_time.append(paragraphs[0].text)
            reply_name.append(paragraphs[3].text)
            reply_time.append(paragraphs[2].text)
            link.append('https://www.mobile01.com/'+first_link.get('href'))
    except Exception as exc:
        # Best effort: one bad page must not abort the whole crawl. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, and the message says which page failed and why.
        print('error', page_link, repr(exc))
        return pd.DataFrame(columns=columns)

    return pd.DataFrame({"主題":topic,"人氣":Popularity,"作者":author,
                         "發文時間":issuing_time,"最新回應者":reply_name,
                         "最新回應時間":reply_time,"回應人數":reply_amount,
                         "連結":link},columns = columns)

## 用MultiProcess的pool來爬

In [6]:
def multicore(link_list):
    """Scrape every URL in ``link_list`` in parallel worker processes.

    Args:
        link_list: Listing-page URLs; each one is passed to ``get_data``.

    Returns:
        A list with one ``get_data`` result per URL, in the same order as
        ``link_list``.
    """
    # The context manager shuts the pool down once map() has returned all
    # results, so the worker processes do not outlive the call.
    with mp.Pool() as pool:
        return pool.map(get_data, link_list)

## 看看速度如何吧

In [7]:
# Time the whole parallel crawl and report the elapsed wall-clock minutes.
st = time.time()
data = multicore(link_list)
st1 = time.time()
elapsed_minutes = (st1 - st) / 60
print ('總花費時間為：' + str(elapsed_minutes))

總花費時間為：6.624648344516754


## 生成CSV

In [8]:
# Start bike.csv afresh with the first result, including the header line.
data[0].to_csv(
    "bike.csv",
    mode='w',
    index=False,
    encoding="utf-8",
)

In [9]:
# Append the remaining results to bike.csv without repeating the header.
for row in data[1:]:
    # Skip missing results (None) so that one page that could not be scraped
    # does not abort the export with an AttributeError.
    if row is None:
        continue
    row.to_csv("bike.csv", index=False,mode = 'a+', header = False, encoding="utf-8")

In [3]:
# Print a message each time one second has elapsed; runs until interrupted.
before = time.time()

while True:
    now = time.time()
    # time.time() differences are already in seconds, so compare directly
    # (dividing by 60 would make this a one-minute threshold, contradicting
    # the message).
    if now - before >= 1:
        print ('1 second')
        # Restart the interval only after reporting; resetting it on every
        # iteration would keep the elapsed time near zero forever.
        before = now
    # Yield the CPU between checks instead of busy-spinning.
    time.sleep(0.01)

KeyboardInterrupt: 