# 反爬：代理伺服器/IP

* 了解「IP 黑/白名單」的反爬蟲機制
* 「IP 黑/白名單」反爬蟲的因應策略

## 作業目標

* 目前程式中的 proxy_ips 是手動輸入的，請改寫程式，從 https://free-proxy-list.net/ 自動抓取可用的 Proxy 來產生 proxy_ips。




In [13]:
from bs4 import BeautifulSoup
import requests
import random

In [14]:
# Build proxy_ips as a list of "ip:port" strings scraped from the proxy table
# on free-proxy-list.net (at most the first 100 rows).
proxy_ips = []

url = 'https://free-proxy-list.net/'
# A timeout keeps the cell from hanging if the site stalls, and
# raise_for_status() surfaces an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# find() returns None when the page has no <tbody>; fail with a clear message
# rather than an AttributeError on None.
tbody = soup.find('tbody')
if tbody is None:
    raise RuntimeError('No proxy table found at ' + url)

for tr in tbody.find_all('tr')[:100]:
    cells = tr.find_all('td')
    # Skip rows that do not carry at least two cells.
    if len(cells) < 2:
        continue
    # The first two cells are joined as "<cell 0>:<cell 1>", i.e. ip:port.
    proxy_ips.append(cells[0].text + ':' + cells[1].text)

print(proxy_ips)

['131.221.32.180:8080', '200.89.178.241:80', '139.59.169.246:3128', '139.59.59.63:8080', '159.65.151.96:8080', '154.16.63.16:8080', '188.6.164.138:55042', '27.123.255.82:30029', '159.8.114.37:8123', '138.122.140.35:3128', '139.59.61.229:8080', '188.166.83.20:3128', '45.127.134.195:8080', '176.9.75.42:8080', '138.197.146.74:8080', '139.59.64.9:8080', '188.166.83.34:3128', '190.7.141.66:44945', '200.7.193.229:58218', '138.197.157.32:8080', '200.73.129.85:8888', '181.117.176.236:61358', '188.226.141.61:3128', '177.91.111.233:8080', '128.199.77.66:47503', '88.198.24.108:8080', '154.16.202.22:8080', '3.124.80.57:80', '144.217.163.138:8080', '169.57.157.148:8123', '78.62.214.242:60678', '125.25.80.39:42790', '207.154.231.213:8080', '82.196.11.105:8080', '103.105.77.21:8080', '94.101.141.245:80', '178.62.193.19:3128', '200.89.178.231:3128', '41.217.219.49:54302', '138.197.145.103:8080', '46.4.96.137:3128', '207.154.231.212:8080', '188.226.141.127:3128', '85.47.31.179:3128', '188.166.83.13:312

In [16]:
# Probe the scraped proxies in order and keep those that can fetch the
# IP-echo page; stop as soon as more than five working proxies are collected.
available_proxies = []

# Slicing (instead of indexing with range(100)) cannot raise IndexError when
# fewer than 100 proxies were scraped.
for ip in proxy_ips[:100]:
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                            proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        # find() returns None when the expected <h1 id="ipd"> is missing,
        # which raises AttributeError and counts as a failed proxy.
        print(soup.find('h1', id='ipd').text.strip())
    except Exception:
        # Any failure marks this proxy as unusable. Unlike a bare `except:`,
        # this still lets KeyboardInterrupt/SystemExit stop the loop.
        print('Fail')
        continue

    available_proxies.append(ip)
    if len(available_proxies) > 5:
        break

print(available_proxies)

Use 131.221.32.180:8080
Fail
Use 200.89.178.241:80
Fail
Use 139.59.169.246:3128
Your IP address: 139.59.169.246
Use 139.59.59.63:8080
Your IP address: 139.59.59.63
Use 159.65.151.96:8080
Your IP address: 159.65.151.96
Use 154.16.63.16:8080
Your IP address: 154.16.63.16
Use 188.6.164.138:55042
Fail
Use 27.123.255.82:30029
Fail
Use 159.8.114.37:8123
Fail
Use 138.122.140.35:3128
Your IP address: 138.122.140.35
Use 139.59.61.229:8080
Your IP address: 139.59.61.229
['139.59.169.246:3128', '139.59.59.63:8080', '159.65.151.96:8080', '154.16.63.16:8080', '138.122.140.35:3128', '139.59.61.229:8080']


In [18]:
# Send a request to https://httpbin.org/ip through each of the working
# proxies found above, to check whether proxying (swapping the IP) succeeded.

for ip in available_proxies:
    print('Use', ip)
    try:
        resp = requests.get('https://httpbin.org/ip',
                            proxies={'http': ip, 'https': ip}, timeout=10)
        print(resp.json())
    except Exception:
        # Request errors and non-JSON replies both count as a failure. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt/SystemExit through.
        print('Fail')

Use 139.59.169.246:3128
Fail
Use 139.59.59.63:8080
Fail
Use 159.65.151.96:8080
Fail
Use 154.16.63.16:8080
Fail
Use 138.122.140.35:3128
Fail
Use 139.59.61.229:8080
Fail


In [15]:
# Make ten requests to the IP-echo page, each through a proxy picked at random
# from proxy_ips, and print the IP the page reports (or 'Fail').
for _ in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                            proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        # find() returns None when the expected <h1 id="ipd"> is missing,
        # which raises AttributeError and counts as a failed proxy.
        print(soup.find('h1', id='ipd').text.strip())
    except Exception:
        # Any failure marks this attempt as failed. Unlike a bare `except:`,
        # this still lets KeyboardInterrupt/SystemExit stop the loop.
        print('Fail')

Use 188.166.83.20:3128
Your IP address: 188.166.83.20
Use 190.152.125.250:43133
Fail
Use 154.16.202.22:8080
Your IP address: 154.16.202.22
Use 80.211.228.27:8080
Your IP address: 80.211.228.27
Use 41.217.219.49:54302
Fail
Use 139.59.109.156:3128
Your IP address: 139.59.109.156
Use 178.134.71.138:35942
Fail
Use 138.68.165.154:3128
Your IP address: 138.68.165.154
Use 207.154.231.216:8080
Your IP address: 207.154.231.216
Use 200.7.193.229:58218
Fail
