# 反爬：代理伺服器/IP

* 了解「IP 黑/白名單」的反爬蟲機制
* 「IP 黑/白名單」反爬蟲的因應策略

## 作業目標

* 目前程式中的 proxy_ips 是手動輸入的，請改寫程式，從 https://free-proxy-list.net/ 自動抓取可用的 Proxy，並以抓取結果產生 proxy_ips 清單。




In [1]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
import requests
import re
import math
import pandas as pd
import random

In [2]:
# Scrape every page of the proxy table on free-proxy-list.net with Selenium.
#
# Module-level results produced by this cell:
#   proxy_ips - list of "ip:port" strings, one per scraped table row
#   data      - list of rows, each row being the list of its cell texts
#   columns   - header texts of the table, used as DataFrame column names
proxy_url = "https://free-proxy-list.net/"
entries_per_page = '80'
proxy_ips = []
data = []
columns = []
page_total = None  # filled in once, from the first page that gets parsed

# The script and the XPath are kept byte-for-byte as they were (including the
# whitespace introduced by the backslash continuation inside the literal).
show_length_js = "document.getElementById('proxylisttable_length') \
                        .style.display='inline-block';"
length_select_xpath = "//div[@id='proxylisttable_length'] \
                                                //select[@name='proxylisttable_length']"
table_attrs = {'class': 'table table-striped table-bordered dataTable'}

# NOTE(review): `executable_path` is the Selenium 3 way of locating the
# driver; newer Selenium 4 releases expect a Service object instead. Left
# unchanged so the cell keeps working with the Selenium version it was
# written for - confirm against the installed version.
browser = webdriver.Chrome(executable_path='./Data/chromedriver')
try:
    browser.get(proxy_url)  # open the browser and load the page
    time.sleep(2)  # give the page some time to finish rendering

    # Reveal the "entries per page" control so that it can be interacted with.
    browser.execute_script(show_length_js)
    WebDriverWait(browser, 2).until(
        EC.visibility_of_element_located((By.ID, 'proxylisttable_length')))

    # Switch the table to 80 rows per page.
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # no longer exist in current Selenium releases.
    options = Select(browser.find_element(By.XPATH, length_select_xpath))
    options.select_by_value(entries_per_page)

    while True:
        time.sleep(5)  # wait for the table to be redrawn after a page switch
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')

        active_page = soup.find('li',
                                attrs={'class': 'fg-button ui-button ui-state-default active'}
                                ).a.text
        table = soup.find('table', attrs=table_attrs)

        # Derive the total page count from the "... of N entries" summary and
        # read the column headers. Done on the first parsed page, whatever its
        # number, so neither name can be undefined further down.
        if page_total is None:
            entries_info = re.findall(r'\d+',
                                      soup.find('div', attrs={'id': 'proxylisttable_info'}).text)
            # At least one page, so an empty table still ends the loop.
            page_total = str(max(1, math.ceil(int(entries_info[2]) / int(entries_per_page))))
            columns = [th.get_text() for th in table.thead.find_all('th')]

        print('目前在爬取第{}頁/共{}頁...'.format(active_page, page_total))

        ips_list = table.find_all('tr', attrs={'class': re.compile('odd|even')})

        # Collect the proxies of the current page.
        for ips in ips_list:
            # Only <td> cells are read, so stray whitespace nodes between the
            # cells cannot shift the columns.
            ip_info = [cell.get_text() for cell in ips.find_all('td')]
            if len(ip_info) < 2:
                # Not a proxy row (e.g. a placeholder row): no IP/port pair.
                continue
            data.append(ip_info)
            proxy_ips.append(ip_info[0] + ':' + ip_info[1])

        if active_page == page_total:
            print('沒有下一頁了...')
            break

        print('切換下一頁中...')
        browser.find_element(By.LINK_TEXT, 'Next').click()
finally:
    # Always close the browser, even when scraping fails half-way, so that no
    # Chrome/chromedriver process is left behind.
    browser.quit()

# Show all scraped proxy data.
pd.DataFrame(data, columns=columns)

目前在爬取第1頁/共4頁...
切換下一頁中...
目前在爬取第2頁/共4頁...
切換下一頁中...
目前在爬取第3頁/共4頁...
切換下一頁中...
目前在爬取第4頁/共4頁...
沒有下一頁了...


Unnamed: 0,IP Address,Port,Code,Country,Anonymity,Google,Https,Last Checked
0,180.183.126.125,8080,TH,Thailand,transparent,no,no,11 seconds ago
1,157.230.41.185,3128,SG,Singapore,transparent,no,no,11 seconds ago
2,110.77.180.89,8080,TH,Thailand,transparent,no,no,11 seconds ago
3,27.68.135.14,30199,VN,Vietnam,elite proxy,no,yes,11 seconds ago
4,177.130.63.245,37188,BR,Brazil,elite proxy,no,yes,2 minutes ago
...,...,...,...,...,...,...,...,...
295,47.75.11.94,8080,HK,Hong Kong,anonymous,no,no,21 minutes ago
296,82.196.11.105,8080,NL,Netherlands,anonymous,no,no,21 minutes ago
297,157.119.118.122,45653,IN,India,elite proxy,no,yes,21 minutes ago
298,199.247.0.229,34043,DE,Germany,elite proxy,no,yes,21 minutes ago


In [3]:
# Try ten randomly chosen proxies from proxy_ips against an "what is my IP"
# page and print the address that page reports, or 'Fail' when the proxy
# does not work.
for _ in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                            # The scheme in the proxy URL is the protocol used
                            # to talk to the proxy itself. 'http://' is used for
                            # both entries because an 'https://' proxy URL makes
                            # requests attempt TLS to the proxy.
                            proxies={'http': 'http://' + ip,
                                     'https': 'http://' + ip},
                            timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except Exception:
        # Best effort: any failure (connection error, timeout, unexpected
        # page without the expected <h1>) just marks this proxy as unusable.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt through,
        # so the cell can be interrupted.
        print('Fail')

Use 165.98.53.38:35332
Fail
Use 103.85.63.70:53281
Fail
Use 117.242.147.181:48258
Your IP address: 117.242.147.181
Use 103.26.54.94:8080
Fail
Use 196.216.215.29:56975
Fail
Use 197.159.23.174:39150
Fail
Use 103.9.188.229:36984
Fail
Use 45.115.173.205:3128
Fail
Use 125.25.45.167:52557
Your IP address: 125.25.45.167
Use 46.49.121.187:52101
Fail
