From dbba5376a141470fe0e15f12b5026f0d5599f34e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <1016903103@qq.com> Date: Tue, 14 Jul 2020 04:28:42 +0800 Subject: [PATCH 01/65] Update build.yml --- .github/workflows/build.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 183e5a79..fe8aa55e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,8 +17,6 @@ jobs: run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - name: Build the Docker Image run: docker-compose build - - name: Push the Docker Image - run: docker-compose push - name: Tag and Push Master Version run: | docker tag germey/proxypool germey/proxypool:master From d2320ce04e47c2be05c83747d4ac84e2bd14c4a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <1016903103@qq.com> Date: Tue, 14 Jul 2020 11:09:20 +0800 Subject: [PATCH 02/65] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee7c108b..266fff58 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## 运行示例 -API Server 可以见[部署样例](https://universal.proxypool.cuiqingcai.com/),随机代理[取用地址](https://universal.proxypool.cuiqingcai.com/random),代理源比较少,仅供演示。 +API Server 可以见[部署样例](https://proxypool.scrape.center/),随机代理[取用地址](https://proxypool.scrape.center/random),代理源比较少,仅供演示。 本样例为 GitHub Actions + Kubernetes 自动部署 master 分支代码结果。 From 16c817a380d7c317ef56b89cafa03480a833ab01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <1016903103@qq.com> Date: Sun, 1 Nov 2020 00:20:24 +0800 Subject: [PATCH 03/65] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 266fff58..74ad0307 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,13 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 +如果下载速度特别慢,可以自行修改 Dockerfile,修改改行: + +``` +- RUN pip install -r requirements.txt ++ RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +``` + ## 常规方式运行 如果不使用 Docker 运行,配置好 Python、Redis 环境之后也可运行,步骤如下。 From 891a19dd083c6bec177d8a6c040df7b4f2a3cb7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <1016903103@qq.com> Date: Sun, 1 Nov 2020 00:22:26 +0800 Subject: [PATCH 04/65] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 74ad0307..5be50019 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 -如果下载速度特别慢,可以自行修改 Dockerfile,修改改行: +如果下载速度特别慢,可以自行修改 Dockerfile,修改: -``` +```diff - RUN pip install -r requirements.txt + RUN pip install -r requirements.txt -i https://pypi.douban.com/simple ``` From ede3fbbdbfdc86fec5c53396c21f533af4555832 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 3 Jan 2021 02:16:48 +0800 Subject: [PATCH 05/65] update docker-compose --- docker-compose.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index f0181db2..98ba86df 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,12 +1,12 @@ version: '3' 
services: - redis: + redis4proxypool: image: redis:alpine - container_name: redis + container_name: redis4proxypool command: redis-server ports: - "6379:6379" - restart: always +# restart: always proxypool: build: . image: 'germey/proxypool' @@ -17,4 +17,4 @@ services: volumes: - /tmp/proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis \ No newline at end of file + REDIS_HOST: redis4proxypool \ No newline at end of file From 08054f7c1e90c3de5a02f34f5a9e77f7b4b10687 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 3 Jan 2021 02:18:44 +0800 Subject: [PATCH 06/65] rm volumes --- docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 98ba86df..c39ded37 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: command: redis-server ports: - "6379:6379" -# restart: always + # restart: always proxypool: build: . image: 'germey/proxypool' @@ -14,7 +14,7 @@ services: ports: - "5555:5555" restart: always - volumes: - - /tmp/proxypool/crawlers/private:/app/proxypool/crawlers/private + # volumes: + # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: REDIS_HOST: redis4proxypool \ No newline at end of file From b730e9f88b022064aacf3233b4cfa3255ffb0c16 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Feb 2021 01:53:07 +0800 Subject: [PATCH 07/65] Bump lxml from 4.3.3 to 4.6.2 (#105) Bumps [lxml](https://github.com/lxml/lxml) from 4.3.3 to 4.6.2. - [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.3.3...lxml-4.6.2) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cc0b6111..0e86c5a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ loguru==0.3.2 pyquery==1.4.0 supervisor==4.1.0 redis==2.10.6 -lxml==4.3.3 \ No newline at end of file +lxml==4.6.2 \ No newline at end of file From f91725475cdf9f6533bd19bc88ba79234d82a73a Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 5 Feb 2021 00:40:50 +0800 Subject: [PATCH 08/65] rm log --- examples/usage.py | 2 +- proxypool/setting.py | 4 ++-- supervisord.conf | 14 +++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/usage.py b/examples/usage.py index 87c73b35..bc699ba9 100644 --- a/examples/usage.py +++ b/examples/usage.py @@ -2,7 +2,7 @@ proxypool_url = 'http://127.0.0.1:5555/random' -target_url = 'http://httpbin.org/get' +target_url = 'https://antispider5.scrape.center/' def get_random_proxy(): diff --git a/proxypool/setting.py b/proxypool/setting.py index 7c3008b3..5765714b 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -76,6 +76,6 @@ ENABLE_GETTER = env.bool('ENABLE_GETTER', True) ENABLE_SERVER = env.bool('ENABLE_SERVER', True) -logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') -logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') +# logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') +# 
logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') diff --git a/supervisord.conf b/supervisord.conf index c5828e0c..97f2f86f 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -5,13 +5,25 @@ nodaemon=true process_name=tester command=python3 run.py --processor tester directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 [program:getter] process_name=getter command=python3 run.py --processor getter directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 [program:server] process_name=server command=python3 run.py --processor server -directory=/app \ No newline at end of file +directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 \ No newline at end of file From 9912b980bbe7b1354cac99664fa3fb95db218461 Mon Sep 17 00:00:00 2001 From: xionghaizicuncunzhang <57612793+xionghaizicuncunzhang@users.noreply.github.com> Date: Sun, 7 Feb 2021 02:54:05 +0800 Subject: [PATCH 09/65] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=96=B0=E7=9A=84?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E6=BA=90=E7=88=AC=E5=8F=96=E2=80=94=E2=80=94?= =?UTF-8?q?=E5=A4=AA=E9=98=B3=E4=BB=A3=E7=90=86=20(#107)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add files via upload 新的代理爬虫 * Delete taiyangdaili.py * add taiyang 爬取太阳代理免费ip --- proxypool/crawlers/public/taiyangdaili.py | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 proxypool/crawlers/public/taiyangdaili.py diff --git a/proxypool/crawlers/public/taiyangdaili.py b/proxypool/crawlers/public/taiyangdaili.py new file mode 100644 index 00000000..7a48cb43 --- /dev/null +++ b/proxypool/crawlers/public/taiyangdaili.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from pyquery import PyQuery as pq + +BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' +MAX_PAGE = 5 + + +class TaiyangdailiCrawler(BaseCrawler): + """ + taiyangdaili crawler, http://www.taiyanghttp.com/free/ + """ + urls = [BaseUrl.format(num=i) for i in range(1, 6)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#ip_list .tr.ip_tr').items() + for tr in trs: + host = tr.find('div:nth-child(1)').text() + port = tr.find('div:nth-child(2)').text() + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = TaiyangdailiCrawler() + for proxy in crawler.crawl(): + print(proxy) From e3bbd55ab73814c2e2ae4fda859ef147eab6277e Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" <everhopingandwaiting@users.noreply.github.com> Date: Sun, 7 Feb 2021 02:54:29 +0800 Subject: [PATCH 10/65] =?UTF-8?q?=E6=B7=BB=E5=8A=A0IP=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E7=88=AC=E5=8F=96=20(#106)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create ip89.py www.89ip.cn 免费代理 * Update ip89.py update Class name * Create fatezero_proxylist.py 增加 http://proxylist.fatezero.org/ 代理 * Create ihuan.py i幻 代理 --- .../crawlers/public/fatezero_proxylist.py | 32 +++++++++++++++++ proxypool/crawlers/public/ihuan.py | 34 +++++++++++++++++++ proxypool/crawlers/public/ip89.py | 33 ++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 proxypool/crawlers/public/fatezero_proxylist.py create mode 100644 proxypool/crawlers/public/ihuan.py create mode 100644 
proxypool/crawlers/public/ip89.py diff --git a/proxypool/crawlers/public/fatezero_proxylist.py b/proxypool/crawlers/public/fatezero_proxylist.py new file mode 100644 index 00000000..8a7d6e27 --- /dev/null +++ b/proxypool/crawlers/public/fatezero_proxylist.py @@ -0,0 +1,32 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'http://proxylist.fatezero.org/proxy.list' + + +class FatezeroCrawler(BaseCrawler): + """ + Fatezero crawler,http://proxylist.fatezero.org + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + hosts_ports = html.split('\n') + for addr in hosts_ports: + ip_address = json.loads(addr) + if(True): + host = ip_address['host'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = FatezeroCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py new file mode 100644 index 00000000..7386b705 --- /dev/null +++ b/proxypool/crawlers/public/ihuan.py @@ -0,0 +1,34 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'https://ip.ihuan.me/today/{path}.html' + + +class IhuanCrawler(BaseCrawler): + """ + ip ihuan crawler, https://ip.ihuan.me + """ + urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + # doc = pq(html)('.text-left') + ip_address = re.compile('([\d:\.]*).*?<br>') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = IhuanCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ip89.py b/proxypool/crawlers/public/ip89.py new file mode 100644 index 00000000..f67c3870 --- /dev/null +++ b/proxypool/crawlers/public/ip89.py @@ -0,0 +1,33 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re + +MAX_NUM = 9999 +BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM) + + +class Ip89Crawler(BaseCrawler): + """ + 89ip crawler, http://api.89ip.cn + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + ip_address = re.compile('([\d:\.]*)<br>') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Ip89Crawler() + for proxy in crawler.crawl(): + print(proxy) From 63ca6da268073f6254cbb3dfeb32ec88226ec538 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 14 Feb 2021 01:29:00 +0800 Subject: [PATCH 11/65] update ignore --- .dockerignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 803baf5e..f9306766 100644 --- a/.dockerignore +++ b/.dockerignore @@ -128,4 +128,6 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +proxypool/.env \ No newline at end of file From cf03d87bfaec0a932d135125182f98b317af757f 
Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Mar 2021 02:30:39 +0800 Subject: [PATCH 12/65] Bump aiohttp from 3.6.2 to 3.7.4 (#112) Bumps [aiohttp](https://github.com/aio-libs/aiohttp) from 3.6.2 to 3.7.4. - [Release notes](https://github.com/aio-libs/aiohttp/releases) - [Changelog](https://github.com/aio-libs/aiohttp/blob/master/CHANGES.rst) - [Commits](https://github.com/aio-libs/aiohttp/compare/v3.6.2...v3.7.4) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0e86c5a7..74c8426c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ environs==7.2.0 Flask==1.0.3 attrs==19.1.0 retrying==1.3.3 -aiohttp==3.6.2 +aiohttp==3.7.4 requests==2.22.0 loguru==0.3.2 pyquery==1.4.0 From 4878bf5ae35fd767f2bd3c83f1333850cf64e22f Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" <everhopingandwaiting@users.noreply.github.com> Date: Mon, 8 Mar 2021 02:32:41 +0800 Subject: [PATCH 13/65] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=8B=A5=E5=B9=B2?= =?UTF-8?q?=E4=BB=A3=E7=90=86=EF=BC=8C=20=E4=BC=98=E5=8C=96=E9=83=A8?= =?UTF-8?q?=E5=88=86=E4=BB=A3=E7=A0=81=20(#108)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create ip89.py www.89ip.cn 免费代理 * Update ip89.py update Class name * Create fatezero_proxylist.py 增加 http://proxylist.fatezero.org/ 代理 * Create ihuan.py i幻 代理 * update example usage2 * update requirements.txt * 优化 public crawlers * add proxy jiangxianli * tester 增加单个proxy测试方法 * reset setting Dockerfile docker-compose to default Co-authored-by: jy <jy@gail.com> Co-authored-by: 崔庆才丨静觅 <1016903103@qq.com> --- Dockerfile | 3 +- docker-compose.yml | 2 +- examples/usage2.py | 95 +++++++++++++++++++ proxypool/crawlers/base.py | 9 +- proxypool/crawlers/public/daili66.py | 2 +- .../crawlers/public/fatezero_proxylist.py | 5 +- proxypool/crawlers/public/goubanjia.py | 44 +++++++++ proxypool/crawlers/public/ihuan.py | 5 +- proxypool/crawlers/public/ip3366.py | 6 +- proxypool/crawlers/public/jiangxianli.py | 35 +++++++ proxypool/crawlers/public/kuaidaili.py | 6 +- proxypool/crawlers/public/zhandaye.py | 2 +- proxypool/processors/server.py | 15 +++ proxypool/processors/tester.py | 7 ++ proxypool/storages/redis.py | 4 +- requirements.txt | 20 ++-- 16 files changed, 231 insertions(+), 29 deletions(-) create mode 100644 examples/usage2.py create mode 100644 proxypool/crawlers/public/goubanjia.py create mode 100644 proxypool/crawlers/public/jiangxianli.py diff --git a/Dockerfile b/Dockerfile index dab1227e..4b8f9c0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM python:3.6 WORKDIR /app COPY . . -RUN pip install -r requirements.txt +# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +RUN pip install -r requirements.txt -i VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/docker-compose.yml b/docker-compose.yml index c39ded37..03d85de7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: command: redis-server ports: - "6379:6379" - # restart: always + # restart: always proxypool: build: . 
image: 'germey/proxypool' diff --git a/examples/usage2.py b/examples/usage2.py new file mode 100644 index 00000000..918c5eb2 --- /dev/null +++ b/examples/usage2.py @@ -0,0 +1,95 @@ +# -*- coding: UTF-8 -*- + +''' +''' +import requests +import time +import threading +import urllib3 +from fake_headers import Headers +import uuid +from geolite2 import geolite2 +ips = [] + +# 爬数据的线程类 + +def getChinaIP(ip='127.0.0.1'): + reader = geolite2.reader() + ip_info = reader.get(ip) + geolite2.close() + print(ip_info) + return True if ip_info['country']['iso_code'] == 'CN' else False + + + +class CrawlThread(threading.Thread): + def __init__(self, proxyip): + super(CrawlThread, self).__init__() + self.proxyip = proxyip + + def run(self): + # 开始计时 + pure_ip_address = self.proxyip.split(':')[0] + # 验证IP归属 + if not getChinaIP(pure_ip_address): + # pass + raise ValueError('不是有效IP') + # + start = time.time() + # 消除关闭证书验证的警告 + urllib3.disable_warnings() + headers = Headers(headers=True).generate() + headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' + headers['Pragma'] = 'no-cache' + headers['Host'] = 'bb.cf08tp.cn' + headers['x-forward-for'] = pure_ip_address + headers['Cookie'] = 'PHPSESSID={}'.format( + ''.join(str(uuid.uuid1()).split('-'))) + print(headers) + html = requests.get(headers=headers, url=targetUrl, proxies={ + "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode() + # 结束计时 + end = time.time() + # 输出内容 + print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) + + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************") + +# 获取代理IP的线程类 + + +class GetIpThread(threading.Thread): + def __init__(self, fetchSecond): + super(GetIpThread, self).__init__() + self.fetchSecond = fetchSecond + + def run(self): + global ips + while True: + # 获取IP列表 + res = requests.get(apiUrl).content.decode() + # 按照\n分割获取到的IP + ips = res.split('\n') + # 利用每一个IP + for proxyip in ips: + if proxyip.strip(): + # 开启一个线程 + # CrawlThread(proxyip).start() + try: + CrawlThread(proxyip).run() + time.sleep(1.5) + except Exception as e: + print(e) + # 休眠 + time.sleep(len(ips) /self.fetchSecond ) + + +if __name__ == '__main__': + # 获取IP的API接口 + # apiUrl = "http://127.0.0.1:5555/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp=' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index aa35430e..563d49bb 100644 --- a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -2,17 +2,19 @@ import requests from loguru import logger from proxypool.setting import GET_TIMEOUT - - +from fake_headers import Headers +import time class BaseCrawler(object): urls = [] @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) def fetch(self, url, **kwargs): try: + headers = Headers(headers=True).generate() kwargs.setdefault('timeout', GET_TIMEOUT) kwargs.setdefault('verify', False) - response = requests.get(url, **kwargs) + kwargs.setdefault('headers', headers) + response = requests.get(url ,**kwargs) if response.status_code == 200: response.encoding = 'utf-8' return response.text @@ -27,6 +29,7 @@ def crawl(self): for url in self.urls: logger.info(f'fetching {url}') html = self.fetch(url) + time.sleep(.5) for proxy in 
self.parse(html): logger.info(f'fetched proxy {proxy.string()} from {url}') yield proxy diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py index 09a3ee45..7b3bf7c2 100644 --- a/proxypool/crawlers/public/daili66.py +++ b/proxypool/crawlers/public/daili66.py @@ -4,7 +4,7 @@ BASE_URL = 'http://www.66ip.cn/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 50 class Daili66Crawler(BaseCrawler): diff --git a/proxypool/crawlers/public/fatezero_proxylist.py b/proxypool/crawlers/public/fatezero_proxylist.py index 8a7d6e27..681cf9e4 100644 --- a/proxypool/crawlers/public/fatezero_proxylist.py +++ b/proxypool/crawlers/public/fatezero_proxylist.py @@ -19,13 +19,12 @@ def parse(self, html): hosts_ports = html.split('\n') for addr in hosts_ports: - ip_address = json.loads(addr) - if(True): + if(addr): + ip_address = json.loads(addr) host = ip_address['host'] port = ip_address['port'] yield Proxy(host=host, port=port) - if __name__ == '__main__': crawler = FatezeroCrawler() for proxy in crawler.crawl(): diff --git a/proxypool/crawlers/public/goubanjia.py b/proxypool/crawlers/public/goubanjia.py new file mode 100644 index 00000000..57157858 --- /dev/null +++ b/proxypool/crawlers/public/goubanjia.py @@ -0,0 +1,44 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'http://www.goubanjia.com/' + + +class GoubanjiaCrawler(BaseCrawler): + """ + ip Goubanjia crawler, http://www.goubanjia.com/ + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html)('.ip').items() + # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))]) + for td in doc: + trs = td.children() + ip_str = '' + for tr in trs: + attrib = tr.attrib + if 'style' in attrib and 'none' in tr.attrib['style']: + continue + ip_str+= '' if not tr.text else tr.text + addr_split = ip_str.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + else: + port = trs[-1].text + host = ip_str.replace(port,'') + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = GoubanjiaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py index 7386b705..ccf90a13 100644 --- a/proxypool/crawlers/public/ihuan.py +++ b/proxypool/crawlers/public/ihuan.py @@ -10,8 +10,9 @@ class IhuanCrawler(BaseCrawler): """ ip ihuan crawler, https://ip.ihuan.me """ - urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))] - + path = time.strftime("%Y/%m/%d/%H", time.localtime()) + urls = [BASE_URL.format(path=path)] + ignore = False def parse(self, html): """ parse html file to get proxies diff --git a/proxypool/crawlers/public/ip3366.py b/proxypool/crawlers/public/ip3366.py index 78d29447..474a4f77 100644 --- a/proxypool/crawlers/public/ip3366.py +++ b/proxypool/crawlers/public/ip3366.py @@ -3,15 +3,15 @@ import re -MAX_PAGE = 5 -BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}' +MAX_PAGE = 8 +BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}' class IP3366Crawler(BaseCrawler): """ ip3366 crawler, http://www.ip3366.net/ """ - urls = [BASE_URL.format(page=i) for i in range(1, 8)] + urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)] def parse(self, html): """ diff --git 
a/proxypool/crawlers/public/jiangxianli.py b/proxypool/crawlers/public/jiangxianli.py new file mode 100644 index 00000000..14fc46cc --- /dev/null +++ b/proxypool/crawlers/public/jiangxianli.py @@ -0,0 +1,35 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' + +MAX_PAGE = 10 +class JiangxianliCrawler(BaseCrawler): + """ + jiangxianli crawler,https://ip.jiangxianli.com/ + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + result =json.loads(html) + if result['code'] != 0: + return + MAX_PAGE = int(result['data']['last_page']) + hosts_ports = result['data']['data'] + for ip_address in hosts_ports: + if(ip_address): + host = ip_address['ip'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = JiangxianliCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/kuaidaili.py b/proxypool/crawlers/public/kuaidaili.py index f3fa6437..71ab1717 100644 --- a/proxypool/crawlers/public/kuaidaili.py +++ b/proxypool/crawlers/public/kuaidaili.py @@ -4,15 +4,15 @@ from pyquery import PyQuery as pq -BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/' -MAX_PAGE = 5 +BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/' +MAX_PAGE = 300 class KuaidailiCrawler(BaseCrawler): """ kuaidaili crawler, https://www.kuaidaili.com/ """ - urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + urls = [BASE_URL.format(type=type,page=page) for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)] def parse(self, html): """ diff --git a/proxypool/crawlers/public/zhandaye.py b/proxypool/crawlers/public/zhandaye.py index b6278a28..83af04b6 100755 --- a/proxypool/crawlers/public/zhandaye.py +++ b/proxypool/crawlers/public/zhandaye.py @@ -6,7 +6,7 @@ BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 5 * 2 class ZhandayeCrawler(BaseCrawler): """ diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index e87f82f5..d3edd70d 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -37,6 +37,21 @@ def get_proxy(): return conn.random().string() +@app.route('/all') +def get_proxy_all(): + """ + get a random proxy + :return: get a random proxy + """ + conn = get_conn() + proxies = conn.all() + proxies_string = '' + for proxy in proxies: + proxies_string += str(proxy) + '\n' + + return proxies_string + + @app.route('/count') def get_count(): """ diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index e0812110..f002056a 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -84,7 +84,14 @@ def run(self): if not cursor: break +def run_tester(): + host = '96.113.165.182' + port = '3128' + tasks = [tester.test(Proxy(host=host, port=port))] + tester.loop.run_until_complete(asyncio.wait(tasks)) if __name__ == '__main__': tester = Tester() tester.run() + # run_tester() + diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 60a03e9f..0ebbccc2 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -51,11 +51,11 @@ def random(self) -> Proxy: :return: proxy, like 8.8.8.8:8 """ # try to get proxy with max score - proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) + proxies 
= self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else get proxy by rank - proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) + proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else raise error diff --git a/requirements.txt b/requirements.txt index 74c8426c..3bccfa78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ -environs==7.2.0 -Flask==1.0.3 -attrs==19.1.0 +environs==9.3.0 +Flask==1.1.2 +attrs==20.3.0 retrying==1.3.3 aiohttp==3.7.4 -requests==2.22.0 -loguru==0.3.2 -pyquery==1.4.0 -supervisor==4.1.0 -redis==2.10.6 -lxml==4.6.2 \ No newline at end of file +requests==2.25.1 +loguru==0.5.3 +pyquery==1.4.3 +supervisor==4.2.1 +redis==3.5.3 +lxml==4.6.2 +fake_headers==1.0.2 +maxminddb_geolite2==2018.703 From 84d738b8e0134bb7772ef05750b18147a0a56a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <1016903103@qq.com> Date: Mon, 8 Mar 2021 02:34:16 +0800 Subject: [PATCH 14/65] Update Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4b8f9c0e..280f8b07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ FROM python:3.6 WORKDIR /app COPY . . # RUN pip install -r requirements.txt -i https://pypi.douban.com/simple -RUN pip install -r requirements.txt -i +RUN pip install -r requirements.txt VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] From 216c58f85b210d83edd8237d3c74fd7f0d7d353c Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 28 Mar 2021 19:46:26 +0800 Subject: [PATCH 15/65] change port --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 03d85de7..5716aca6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: container_name: redis4proxypool command: redis-server ports: - - "6379:6379" + - "6374:6379" # restart: always proxypool: build: . From 3884b660d21ed003863f757c9a95c51323c5dd9d Mon Sep 17 00:00:00 2001 From: K8sCat <k8scat@gmail.com> Date: Thu, 1 Apr 2021 00:57:00 +0800 Subject: [PATCH 16/65] :fire: optimize: remove redundant command (#114) --- docker-compose.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5716aca6..c069ec56 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,15 +1,14 @@ -version: '3' +version: "3" services: redis4proxypool: image: redis:alpine container_name: redis4proxypool - command: redis-server ports: - "6374:6379" # restart: always proxypool: build: . 
- image: 'germey/proxypool' + image: "germey/proxypool" container_name: proxypool ports: - "5555:5555" @@ -17,4 +16,4 @@ services: # volumes: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis4proxypool \ No newline at end of file + REDIS_HOST: redis4proxypool From 0e9935d9b3c17bd0b902b88642ba7c8c89afea3a Mon Sep 17 00:00:00 2001 From: K8sCat <k8scat@gmail.com> Date: Tue, 13 Apr 2021 23:45:34 +0800 Subject: [PATCH 17/65] optimize: use python:3.6-alpine (#116) * :zap: optimize: use python:3.6-alpine * :rocket: update: del build pkgs --- Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 280f8b07..c38bf89b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ -FROM python:3.6 +FROM python:3.6-alpine WORKDIR /app COPY . . # RUN pip install -r requirements.txt -i https://pypi.douban.com/simple -RUN pip install -r requirements.txt +RUN apk add --no-cache libxml2-dev libxslt-dev gcc musl-dev && \ +pip install -r requirements.txt && \ +apk del gcc musl-dev libxml2-dev VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] From ab917a8f0a6d65ce771501539047c776851a0e67 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Apr 2021 23:46:45 +0800 Subject: [PATCH 18/65] Bump lxml from 4.6.2 to 4.6.3 (#115) Bumps [lxml](https://github.com/lxml/lxml) from 4.6.2 to 4.6.3. - [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.2...lxml-4.6.3) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3bccfa78..afa85606 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,6 @@ loguru==0.5.3 pyquery==1.4.3 supervisor==4.2.1 redis==3.5.3 -lxml==4.6.2 +lxml==4.6.3 fake_headers==1.0.2 maxminddb_geolite2==2018.703 From ad5d25406790548dee96459945ebdcb97654e4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Tue, 1 Jun 2021 02:32:20 +0800 Subject: [PATCH 19/65] Update README.md --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5be50019..fafb8d7a 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,20 @@ API Server 可以见[部署样例](https://proxypool.scrape.center/),随机代 本样例为 GitHub Actions + Kubernetes 自动部署 master 分支代码结果。 +## 使用准备 + +首先当然是克隆代码并进入 ProxyPool 文件夹: + +``` +git clone https://github.com/Python3WebSpider/ProxyPool.git +cd ProxyPool +``` + +然后选用下面 Docker 和常规方式任意一个执行即可。 + ## 使用要求 -可以通过两种方式来运行代理池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行。 +可以通过两种方式来运行代理池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行,要求如下: ### Docker @@ -31,6 +42,8 @@ API Server 可以见[部署样例](https://proxypool.scrape.center/),随机代 * Docker * Docker-Compose +安装方法自行搜索即可。 + ### 常规方式 常规方式要求有 Python 环境、Redis 环境,具体要求如下: From 1de4f95d3ac8b7fc1657f8e4fd127b263709be0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Tue, 1 Jun 2021 02:32:56 +0800 Subject: [PATCH 20/65] Update README.md --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index fafb8d7a..441f7c4e 100644 --- a/README.md 
+++ b/README.md @@ -14,12 +14,6 @@ 代理池原理解析可见「[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)」,建议使用之前阅读。 -## 运行示例 - -API Server 可以见[部署样例](https://proxypool.scrape.center/),随机代理[取用地址](https://proxypool.scrape.center/random),代理源比较少,仅供演示。 - -本样例为 GitHub Actions + Kubernetes 自动部署 master 分支代码结果。 - ## 使用准备 首先当然是克隆代码并进入 ProxyPool 文件夹: From dac05be5db7f6c81db5461259ac4fb37e3743616 Mon Sep 17 00:00:00 2001 From: wc571498244 <51046547+wc571498244@users.noreply.github.com> Date: Tue, 12 Oct 2021 21:46:20 +0800 Subject: [PATCH 21/65] =?UTF-8?q?=E6=96=B0=E5=A2=9E=EF=BC=9A=E5=B0=8F?= =?UTF-8?q?=E8=88=92=E4=BB=A3=E7=90=86=E3=80=81seo=E6=96=B9=E6=B3=95?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E3=80=81yqie=E4=BB=A3=E7=90=86=20(#124)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proxypool/crawlers/public/seofangfa.py | 34 ++++++++++++++++ proxypool/crawlers/public/xiaoshudaili.py | 49 +++++++++++++++++++++++ proxypool/crawlers/public/yqie.py | 32 +++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 proxypool/crawlers/public/seofangfa.py create mode 100644 proxypool/crawlers/public/xiaoshudaili.py create mode 100644 proxypool/crawlers/public/yqie.py diff --git a/proxypool/crawlers/public/seofangfa.py b/proxypool/crawlers/public/seofangfa.py new file mode 100644 index 00000000..1f5a20a2 --- /dev/null +++ b/proxypool/crawlers/public/seofangfa.py @@ -0,0 +1,34 @@ +import requests +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +requests.packages.urllib3.disable_warnings() +BASE_URL = "https://proxy.seofangfa.com/" +MAX_PAGE = 1 + + +class SeoFangFaCrawler(BaseCrawler): + """ + seo方法 crawler, https://proxy.seofangfa.com/ + """ + urls = ["https://proxy.seofangfa.com/"] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.table tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = SeoFangFaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/xiaoshudaili.py b/proxypool/crawlers/public/xiaoshudaili.py new file mode 100644 index 00000000..bb4d7d42 --- /dev/null +++ b/proxypool/crawlers/public/xiaoshudaili.py @@ -0,0 +1,49 @@ +import re + +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://www.xsdaili.cn/" +PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html" +MAX_PAGE = 50 + + +class XiaoShuCrawler(BaseCrawler): + """ + 小舒代理 crawler, http://www.xsdaili.cn/ + """ + + def __init__(self): + html = self.fetch(url=BASE_URL) + doc = pq(html) + title = doc(".title:eq(0) a").items() + + latest_page = 0 + for t in title: + res = re.search(r"/(\d+)\.html", t.attr("href")) + latest_page = int(res.group(1)) if res else 0 + if latest_page: + self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)] + else: + self.urls = [] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + contents = doc('.cont').text() + contents = contents.split("\n") + for content in contents: + c = content[:content.find("@")] + host, port = c.split(":") + yield Proxy(host=host, port=int(port)) + + +if __name__ == '__main__': + crawler = XiaoShuCrawler() + for proxy in 
crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/yqie.py b/proxypool/crawlers/public/yqie.py new file mode 100644 index 00000000..fb3feaf8 --- /dev/null +++ b/proxypool/crawlers/public/yqie.py @@ -0,0 +1,32 @@ +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://ip.yqie.com/ipproxy.htm" +MAX_PAGE = 1 + + +class YqIeCrawler(BaseCrawler): + """ + ip yqie crawler, http://ip.yqie.com/ipproxy.htm + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#GridViewOrder tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = YqIeCrawler() + for proxy in crawler.crawl(): + print(proxy) From 47cb3bcbac80afafc4fd7d15e94216830c4e3599 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Dec 2021 01:36:35 +0800 Subject: [PATCH 22/65] Bump lxml from 4.6.3 to 4.6.5 (#131) Bumps [lxml](https://github.com/lxml/lxml) from 4.6.3 to 4.6.5. - [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.3...lxml-4.6.5) --- updated-dependencies: - dependency-name: lxml dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index afa85606..aa5eddcc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,6 @@ loguru==0.5.3 pyquery==1.4.3 supervisor==4.2.1 redis==3.5.3 -lxml==4.6.3 +lxml==4.6.5 fake_headers==1.0.2 maxminddb_geolite2==2018.703 From e552819a4ac168ad3055fcb70fdcb2dd57dbcbca Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 02:45:47 +0800 Subject: [PATCH 23/65] update docker-compose file --- .github/workflows/build.yml | 2 +- .github/workflows/deploy.yml | 2 +- README.md | 94 +++++++++++++++++++----------------- build.yml | 19 ++++++++ docker-compose.yml | 6 +-- 5 files changed, 73 insertions(+), 50 deletions(-) create mode 100644 build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fe8aa55e..d1724a5a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,7 @@ jobs: - name: Docker Login run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - name: Build the Docker Image - run: docker-compose build + run: docker-compose -f build.yml build - name: Tag and Push Master Version run: | docker tag germey/proxypool germey/proxypool:master diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 8257ba0a..ac5a5c69 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -35,7 +35,7 @@ jobs: echo $BUILD_NUMBER - name: Build Push Deploy run: | - docker-compose build + docker-compose -f build.yml build docker tag germey/proxypool germey/proxypool:$BUILD_NUMBER docker push germey/proxypool docker push germey/proxypool:$BUILD_NUMBER diff --git a/README.md b/README.md index 441f7c4e..3db7a2ea 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ 简易高效的代理池,提供如下功能: -* 定时抓取免费代理网站,简易可扩展。 -* 
使用 Redis 对代理进行存储并对代理可用性进行排序。 -* 定时测试和筛选,剔除不可用代理,留下可用代理。 -* 提供代理 API,随机取用测试通过的可用代理。 +- 定时抓取免费代理网站,简易可扩展。 +- 使用 Redis 对代理进行存储并对代理可用性进行排序。 +- 定时测试和筛选,剔除不可用代理,留下可用代理。 +- 提供代理 API,随机取用测试通过的可用代理。 代理池原理解析可见「[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)」,建议使用之前阅读。 @@ -33,8 +33,8 @@ cd ProxyPool 如果使用 Docker,则需要安装如下环境: -* Docker -* Docker-Compose +- Docker +- Docker-Compose 安装方法自行搜索即可。 @@ -42,8 +42,8 @@ cd ProxyPool 常规方式要求有 Python 环境、Redis 环境,具体要求如下: -* Python>=3.6 -* Redis +- Python>=3.6 +- Redis ## Docker 运行 @@ -72,6 +72,12 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 +当然你也可以选择自己 Build,直接运行如下命令即可: + +``` +docker-compose -f build.yml up +``` + 如果下载速度特别慢,可以自行修改 Dockerfile,修改: ```diff @@ -118,7 +124,7 @@ export REDIS_CONNECTION_STRING='redis://@host:port/db' ### 安装依赖包 -这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) +这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) 或 [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html) 创建虚拟环境,Python 版本不低于 3.6。 然后 pip 安装依赖即可: @@ -198,15 +204,15 @@ if __name__ == '__main__': ``` get random proxy 116.196.115.209:8080 { - "args": {}, + "args": {}, "headers": { - "Accept": "*/*", - "Accept-Encoding": "gzip, deflate", - "Host": "httpbin.org", - "User-Agent": "python-requests/2.22.0", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate", + "Host": "httpbin.org", + "User-Agent": "python-requests/2.22.0", "X-Amzn-Trace-Id": "Root=1-5e4d7140-662d9053c0a2e513c7278364" - }, - "origin": "116.196.115.209", + }, + "origin": "116.196.115.209", "url": "https://httpbin.org/get" } ``` @@ -219,41 +225,41 @@ get random proxy 116.196.115.209:8080 ### 开关 -* ENABLE_TESTER:允许 Tester 启动,默认 true -* ENABLE_GETTER:允许 Getter 启动,默认 true -* ENABLE_SERVER:运行 Server 启动,默认 true +- ENABLE_TESTER:允许 Tester 启动,默认 true +- ENABLE_GETTER:允许 Getter 启动,默认 true +- ENABLE_SERVER:运行 Server 启动,默认 true ### 环境 -* APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev -* APP_DEBUG:调试模式,可以设置 true 或 false,默认 true +- APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev +- APP_DEBUG:调试模式,可以设置 true 或 false,默认 true ### Redis 连接 -* REDIS_HOST:Redis 的 Host -* REDIS_PORT:Redis 的端口 -* REDIS_PASSWORD:Redis 的密码 -* REDIS_DB:Redis 的数据库索引,如 0、1 -* REDIS_CONNECTION_STRING:Redis 连接字符串 -* REDIS_KEY:Redis 储存代理使用字典的名称 +- REDIS_HOST:Redis 的 Host +- REDIS_PORT:Redis 的端口 +- REDIS_PASSWORD:Redis 的密码 +- REDIS_DB:Redis 的数据库索引,如 0、1 +- REDIS_CONNECTION_STRING:Redis 连接字符串 +- REDIS_KEY:Redis 储存代理使用字典的名称 ### 处理器 -* CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 -* CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 -* TEST_URL:测试 URL,默认百度 -* TEST_TIMEOUT:测试超时时间,默认 10 秒 -* TEST_BATCH:批量测试数量,默认 20 个代理 -* TEST_VALID_STATUS:测试有效的状态吗 -* API_HOST:代理 Server 运行 Host,默认 0.0.0.0 -* API_PORT:代理 Server 运行端口,默认 5555 -* API_THREADED:代理 Server 是否使用多线程,默认 true +- CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 +- CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 +- TEST_URL:测试 URL,默认百度 +- TEST_TIMEOUT:测试超时时间,默认 10 秒 +- TEST_BATCH:批量测试数量,默认 20 个代理 +- TEST_VALID_STATUS:测试有效的状态吗 +- API_HOST:代理 Server 运行 Host,默认 0.0.0.0 +- API_PORT:代理 Server 运行端口,默认 5555 +- API_THREADED:代理 Server 是否使用多线程,默认 true ### 日志 -* LOG_DIR:日志相对路径 -* LOG_RUNTIME_FILE:运行日志文件名称 -* LOG_ERROR_FILE:错误日志文件名称 +- LOG_DIR:日志相对路径 +- LOG_RUNTIME_FILE:运行日志文件名称 +- LOG_ERROR_FILE:错误日志文件名称 
以上内容均可使用环境变量配置,即在运行前设置对应环境变量值即可,如更改测试地址和 Redis 键名: @@ -267,7 +273,7 @@ export REDIS_KEY=proxies:weibo 如果使用 Docker-Compose 启动代理池,则需要在 docker-compose.yml 文件里面指定环境变量,如: ```yaml -version: '3' +version: "3" services: redis: image: redis:alpine @@ -278,7 +284,7 @@ services: restart: always proxypool: build: . - image: 'germey/proxypool' + image: "germey/proxypool" container_name: proxypool ports: - "5555:5555" @@ -310,7 +316,7 @@ class Daili66Crawler(BaseCrawler): daili66 crawler, http://www.66ip.cn/1.html """ urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] - + def parse(self, html): """ parse html file to get proxies @@ -326,8 +332,8 @@ class Daili66Crawler(BaseCrawler): 在这里只需要定义一个 Crawler 继承 BaseCrawler 即可,然后定义好 urls 变量和 parse 方法即可。 -* urls 变量即为爬取的代理网站网址列表,可以用程序定义也可写成固定内容。 -* parse 方法接收一个参数即 html,代理网址的 html,在 parse 方法里只需要写好 html 的解析,解析出 host 和 port,并构建 Proxy 对象 yield 返回即可。 +- urls 变量即为爬取的代理网站网址列表,可以用程序定义也可写成固定内容。 +- parse 方法接收一个参数即 html,代理网址的 html,在 parse 方法里只需要写好 html 的解析,解析出 host 和 port,并构建 Proxy 对象 yield 返回即可。 网页的爬取不需要实现,BaseCrawler 已经有了默认实现,如需更改爬取方式,重写 crawl 方法即可。 diff --git a/build.yml b/build.yml new file mode 100644 index 00000000..c069ec56 --- /dev/null +++ b/build.yml @@ -0,0 +1,19 @@ +version: "3" +services: + redis4proxypool: + image: redis:alpine + container_name: redis4proxypool + ports: + - "6374:6379" + # restart: always + proxypool: + build: . + image: "germey/proxypool" + container_name: proxypool + ports: + - "5555:5555" + restart: always + # volumes: + # - proxypool/crawlers/private:/app/proxypool/crawlers/private + environment: + REDIS_HOST: redis4proxypool diff --git a/docker-compose.yml b/docker-compose.yml index c069ec56..8ab8228f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,15 +5,13 @@ services: container_name: redis4proxypool ports: - "6374:6379" - # restart: always proxypool: - build: . 
image: "germey/proxypool" container_name: proxypool ports: - "5555:5555" restart: always - # volumes: - # - proxypool/crawlers/private:/app/proxypool/crawlers/private + # volumes: + # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: REDIS_HOST: redis4proxypool From 30d5b0c4be4d4ed12b0c03e28203944d1c5bc607 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 02:47:34 +0800 Subject: [PATCH 24/65] update to master --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 8ab8228f..3c57f280 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: ports: - "6374:6379" proxypool: - image: "germey/proxypool" + image: "germey/proxypool:master" container_name: proxypool ports: - "5555:5555" From dcf93a3cc1d171e4bed759407468ece2de90f115 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 14:49:50 +0800 Subject: [PATCH 25/65] add kubernetes --- Dockerfile | 12 ++- README.md | 6 +- build.yml => build.yaml | 2 +- deployment.yml | 99 ------------------- ingress.yml | 32 ------ kubernetes/.helmignore | 24 +++++ kubernetes/Chart.yaml | 27 +++++ kubernetes/README.md | 37 +++++++ kubernetes/templates/_helpers.tpl | 53 ++++++++++ .../templates/proxypool-deployment.yaml | 29 ++++++ kubernetes/templates/proxypool-ingress.yaml | 41 ++++++++ kubernetes/templates/proxypool-service.yaml | 15 +++ kubernetes/templates/redis-deployment.yaml | 30 ++++++ kubernetes/templates/redis-service.yaml | 13 +++ kubernetes/values.yaml | 37 +++++++ 15 files changed, 316 insertions(+), 141 deletions(-) rename build.yml => build.yaml (91%) delete mode 100644 deployment.yml delete mode 100644 ingress.yml create mode 100644 kubernetes/.helmignore create mode 100644 kubernetes/Chart.yaml create mode 100644 kubernetes/README.md create mode 100644 kubernetes/templates/_helpers.tpl create mode 100644 kubernetes/templates/proxypool-deployment.yaml create mode 100644 kubernetes/templates/proxypool-ingress.yaml create mode 100644 kubernetes/templates/proxypool-service.yaml create mode 100644 kubernetes/templates/redis-deployment.yaml create mode 100644 kubernetes/templates/redis-service.yaml create mode 100644 kubernetes/values.yaml diff --git a/Dockerfile b/Dockerfile index c38bf89b..0db474f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,13 @@ -FROM python:3.6-alpine +FROM nginx:alpine WORKDIR /app +RUN apk add --no-cache --virtual .build-deps g++ python3-dev libffi-dev \ + openssl-dev libxml2-dev libxslt-dev gcc musl-dev py3-pip && \ + apk add --no-cache --update python3 && \ + pip3 install --upgrade pip setuptools +COPY requirements.txt . +RUN pip3 install -r requirements.txt && \ +apk del g++ gcc musl-dev libxml2-dev COPY . . 
# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple -RUN apk add --no-cache libxml2-dev libxslt-dev gcc musl-dev && \ -pip install -r requirements.txt && \ -apk del gcc musl-dev libxml2-dev VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/README.md b/README.md index 3db7a2ea..17a8fab3 100644 --- a/README.md +++ b/README.md @@ -341,11 +341,7 @@ class Daili66Crawler(BaseCrawler): ## 部署 -本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,执行如下命令即可: - -```shell script -cat deployment.yml | sed 's/\${TAG}/latest/g' | kubectl apply -f - -``` +本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,请参考 [kubernetes](./kubernetes)。 ## 待开发 diff --git a/build.yml b/build.yaml similarity index 91% rename from build.yml rename to build.yaml index c069ec56..82fd9559 100644 --- a/build.yml +++ b/build.yaml @@ -8,7 +8,7 @@ services: # restart: always proxypool: build: . - image: "germey/proxypool" + image: "germey/proxypool:master" container_name: proxypool ports: - "5555:5555" diff --git a/deployment.yml b/deployment.yml deleted file mode 100644 index c7aaea55..00000000 --- a/deployment.yml +++ /dev/null @@ -1,99 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - creationTimestamp: null - name: proxypool ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: proxypool - namespace: proxypool -spec: - storageClassName: azure-file - accessModes: - - ReadWriteMany - resources: - requests: - storage: 2Gi ---- -apiVersion: v1 -items: - - apiVersion: v1 - kind: Service - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - name: proxypool - namespace: proxypool - spec: - ports: - - name: "5555" - port: 5555 - targetPort: 5555 - selector: - io.kompose.service: proxypool - status: - loadBalancer: {} - - apiVersion: apps/v1 - kind: Deployment - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - name: proxypool - namespace: proxypool - spec: - replicas: 2 - revisionHistoryLimit: 1 - strategy: {} - selector: - matchLabels: - io.kompose.service: proxypool - template: - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - spec: - containers: - - env: - - name: REDIS_CONNECTION_STRING - valueFrom: - secretKeyRef: - name: redis - key: connection_string - - name: REDIS_PORT - value: '6379' - image: germey/proxypool:${TAG} - name: proxypool - resources: - limits: - memory: "500Mi" - cpu: "300m" - requests: - memory: "500Mi" - cpu: "300m" - ports: - - containerPort: 5555 - volumeMounts: - - mountPath: "/app/proxypool/crawlers/private" - name: proxypool - restartPolicy: Always - volumes: - - name: proxypool - persistentVolumeClaim: - claimName: pvc-proxypool - status: {} -kind: List -metadata: {} diff --git a/ingress.yml b/ingress.yml deleted file mode 100644 index 166eb729..00000000 --- a/ingress.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1beta1 -kind: Ingress -metadata: - name: ingress-universal-proxypool - namespace: proxypool - annotations: - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/rewrite-target: / -spec: - tls: - - hosts: - - universal.proxypool.cuiqingcai.com - 
secretName: tls-wildcard-proxypool-cuiqingcai-com - - hosts: - - proxypool.scrape.center - secretName: tls-wildcard-scrape-center - rules: - - host: universal.proxypool.cuiqingcai.com - http: - paths: - - backend: - serviceName: proxypool - servicePort: 5555 - path: / - - host: proxypool.scrape.center - http: - paths: - - backend: - serviceName: proxypool - servicePort: 5555 - path: / \ No newline at end of file diff --git a/kubernetes/.helmignore b/kubernetes/.helmignore new file mode 100644 index 00000000..9716c30e --- /dev/null +++ b/kubernetes/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +image/ \ No newline at end of file diff --git a/kubernetes/Chart.yaml b/kubernetes/Chart.yaml new file mode 100644 index 00000000..58db2bc2 --- /dev/null +++ b/kubernetes/Chart.yaml @@ -0,0 +1,27 @@ +apiVersion: v2 +name: proxypool +description: A Efficient Proxy Pool + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# Keywords about this application. +keywords: + - proxypool + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +appVersion: 1.16.0 diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 00000000..70860d24 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,37 @@ +# Kubernetes 部署 + +这是用来快速部署本代理池的 Helm Charts。 + +首先需要有一个 Kubernetes 集群,其次需要安装 Helm,确保 helm 命令可以正常运行。 + +## 安装 + +安装直接使用 helm 命令在本文件夹运行即可,使用 `-n` 可以制定 NameSpace。 + +```shell +helm install proxypool-app . -n scrape +``` + +其中 proxypool-app 就是应用的名字,可以任意取名,它会用作代理池 Deplyment 的名称。 + +如果需要覆盖变量,可以修改 values.yaml 文件,执行如下命令安装: + +```shell +helm install proxypool-app . -f values.yaml -n scrape +``` + +## 更新 + +如果需要更新配置,可以修改 values.yaml 文件,执行如下命令更新版本: + +```shell +helm upgrade proxypool-app . -f values.yaml -n scrape +``` + +## 卸载 + +如果不想使用了,可以只用 uninstall 命令卸载: + +```shell +helm uninstall proxypool-app -n scrape +``` diff --git a/kubernetes/templates/_helpers.tpl b/kubernetes/templates/_helpers.tpl new file mode 100644 index 00000000..31911df1 --- /dev/null +++ b/kubernetes/templates/_helpers.tpl @@ -0,0 +1,53 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. 
+*/}} +{{- define "proxypool.name" -}} +{{- default .Chart.Name .Values.name | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "proxypool.fullname" -}} +{{- if .Values.fullname }} +{{- .Values.fullname | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.name }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "proxypool.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "proxypool.labels" -}} +helm.sh/chart: {{ include "proxypool.chart" . }} +{{ include "proxypool.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "proxypool.selectorLabels" -}} +app.kubernetes.io/name: {{ include "proxypool.fullname" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/kubernetes/templates/proxypool-deployment.yaml b/kubernetes/templates/proxypool-deployment.yaml new file mode 100644 index 00000000..29ed068e --- /dev/null +++ b/kubernetes/templates/proxypool-deployment.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + revisionHistoryLimit: {{ .Values.deployment.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "proxypool.labels" . | nindent 8 }} + template: + metadata: + labels: + {{- include "proxypool.labels" . | nindent 8 }} + spec: + restartPolicy: {{ .Values.deployment.restartPolicy }} + containers: + - name: {{ include "proxypool.fullname" . }} + image: {{ .Values.deployment.image }} + ports: + - containerPort: 5555 + protocol: TCP + imagePullPolicy: {{ .Values.deployment.imagePullPolicy }} + resources: + {{- toYaml .Values.deployment.resources | nindent 12 }} + env: + {{- toYaml .Values.deployment.env | nindent 12 }} diff --git a/kubernetes/templates/proxypool-ingress.yaml b/kubernetes/templates/proxypool-ingress.yaml new file mode 100644 index 00000000..0706f5d2 --- /dev/null +++ b/kubernetes/templates/proxypool-ingress.yaml @@ -0,0 +1,41 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "proxypool.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ . }} + backend: + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} diff --git a/kubernetes/templates/proxypool-service.yaml b/kubernetes/templates/proxypool-service.yaml new file mode 100644 index 00000000..3d4285b4 --- /dev/null +++ b/kubernetes/templates/proxypool-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: 5555 + protocol: TCP + name: http + selector: + {{- include "proxypool.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/templates/redis-deployment.yaml b/kubernetes/templates/redis-deployment.yaml new file mode 100644 index 00000000..4acf4351 --- /dev/null +++ b/kubernetes/templates/redis-deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + replicas: 1 + revisionHistoryLimit: 1 + selector: + matchLabels: + app: proxypool-redis + template: + metadata: + labels: + app: proxypool-redis + spec: + containers: + - image: redis:alpine + name: proxypool-redis + ports: + - containerPort: 6379 + resources: + limits: + memory: "100Mi" + cpu: "100m" + requests: + memory: "100Mi" + cpu: "100m" + restartPolicy: Always diff --git a/kubernetes/templates/redis-service.yaml b/kubernetes/templates/redis-service.yaml new file mode 100644 index 00000000..5dbda554 --- /dev/null +++ b/kubernetes/templates/redis-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + ports: + - name: "6379" + port: 6379 + targetPort: 6379 + selector: + app: proxypool-redis \ No newline at end of file diff --git a/kubernetes/values.yaml b/kubernetes/values.yaml new file mode 100644 index 00000000..1a7421d1 --- /dev/null +++ b/kubernetes/values.yaml @@ -0,0 +1,37 @@ +name: proxypool +fullname: proxypool-app + +deployment: + image: germey/proxypool:master + imagePullPolicy: Always + restartPolicy: Always + revisionHistoryLimit: 2 + successfulJobsHistoryLimit: 1 + replicas: 1 + resources: + limits: + memory: "200Mi" + cpu: "80m" + requests: + memory: "200Mi" + cpu: "80m" + env: + - name: REDIS_HOST + value: "proxypool-redis" + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + hosts: + - host: proxypool.scrape.center + paths: + - "/" + tls: + - secretName: tls-wildcard-scrape-center + hosts: + - proxypool.scrape.center From b40f033f6aeb6dc3743581d7313f87595f9b5317 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 14:51:01 +0800 Subject: [PATCH 26/65] rename --- .github/workflows/build.yml | 2 +- .github/workflows/deploy.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d1724a5a..966d6d23 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,7 @@ jobs: - name: Docker Login run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - name: Build the Docker Image - run: docker-compose -f build.yml build + run: docker-compose -f 
build.yaml build - name: Tag and Push Master Version run: | docker tag germey/proxypool germey/proxypool:master diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index ac5a5c69..871642d7 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -35,7 +35,7 @@ jobs: echo $BUILD_NUMBER - name: Build Push Deploy run: | - docker-compose -f build.yml build + docker-compose -f build.yaml build docker tag germey/proxypool germey/proxypool:$BUILD_NUMBER docker push germey/proxypool docker push germey/proxypool:$BUILD_NUMBER From 3abbf218a7c45c9be2b1f44d6bd980fd398f6867 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 14:54:25 +0800 Subject: [PATCH 27/65] update readme --- kubernetes/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kubernetes/README.md b/kubernetes/README.md index 70860d24..327880df 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -4,6 +4,11 @@ 首先需要有一个 Kubernetes 集群,其次需要安装 Helm,确保 helm 命令可以正常运行。 +安装参考: + +- Kubernetes:[https://setup.scrape.center/kubernetes](https://setup.scrape.center/kubernetes)。 +- Helm: [https://setup.scrape.center/helm](https://setup.scrape.center/helm)。 + ## 安装 安装直接使用 helm 命令在本文件夹运行即可,使用 `-n` 可以制定 NameSpace。 From 598d02dcf8321f9a706ce0ee4ce5ef12ed8e4927 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Fri, 24 Dec 2021 15:22:28 +0800 Subject: [PATCH 28/65] update --- Dockerfile | 2 +- kubernetes/templates/proxypool-deployment.yaml | 8 ++++++++ proxypool/crawlers/base.py | 8 +++++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0db474f1..1eb790fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nginx:alpine +FROM alpine:3.7 WORKDIR /app RUN apk add --no-cache --virtual .build-deps g++ python3-dev libffi-dev \ openssl-dev libxml2-dev libxslt-dev gcc musl-dev py3-pip && \ diff --git a/kubernetes/templates/proxypool-deployment.yaml b/kubernetes/templates/proxypool-deployment.yaml index 29ed068e..a12854d9 100644 --- a/kubernetes/templates/proxypool-deployment.yaml +++ b/kubernetes/templates/proxypool-deployment.yaml @@ -23,6 +23,14 @@ spec: - containerPort: 5555 protocol: TCP imagePullPolicy: {{ .Values.deployment.imagePullPolicy }} + livenessProbe: + httpGet: + path: /random + port: 5555 + initialDelaySeconds: 60 + periodSeconds: 5 + failureThreshold: 5 + timeoutSeconds: 10 resources: {{- toYaml .Values.deployment.resources | nindent 12 }} env: diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index 563d49bb..69d13fcd 100644 --- a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -4,9 +4,11 @@ from proxypool.setting import GET_TIMEOUT from fake_headers import Headers import time + + class BaseCrawler(object): urls = [] - + @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) def fetch(self, url, **kwargs): try: @@ -14,13 +16,13 @@ def fetch(self, url, **kwargs): kwargs.setdefault('timeout', GET_TIMEOUT) kwargs.setdefault('verify', False) kwargs.setdefault('headers', headers) - response = requests.get(url ,**kwargs) + response = requests.get(url, **kwargs) if response.status_code == 200: response.encoding = 'utf-8' return response.text except requests.ConnectionError: return - + @logger.catch def crawl(self): """ From f7cf6007994fce9da9050d887346b30d577802c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Mon, 27 Dec 2021 02:46:05 +0800 
Subject: [PATCH 29/65] Update build.yml --- .github/workflows/build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 966d6d23..641861f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,5 @@ jobs: run: docker-compose -f build.yaml build - name: Tag and Push Master Version run: | - docker tag germey/proxypool germey/proxypool:master docker push germey/proxypool:master From 08385f6463b3b1cc4be8654bdb0a3dffae131854 Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Tue, 28 Dec 2021 16:18:46 +0800 Subject: [PATCH 30/65] catch retry error --- proxypool/crawlers/base.py | 28 +++++++++++++------ proxypool/crawlers/public/data5u.py | 16 ----------- .../{fatezero_proxylist.py => fatezero.py} | 0 proxypool/crawlers/public/ihuan.py | 1 + proxypool/crawlers/public/jiangxianli.py | 12 +++++--- proxypool/crawlers/public/xiaoshudaili.py | 15 ++++++---- proxypool/crawlers/public/xicidaili.py | 17 ----------- proxypool/crawlers/public/zhandaye.py | 2 +- proxypool/processors/getter.py | 6 ++-- 9 files changed, 42 insertions(+), 55 deletions(-) rename proxypool/crawlers/public/{fatezero_proxylist.py => fatezero.py} (100%) diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index 69d13fcd..4a4bf5cd 100644 --- a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -1,4 +1,4 @@ -from retrying import retry +from retrying import RetryError, retry import requests from loguru import logger from proxypool.setting import GET_TIMEOUT @@ -23,15 +23,25 @@ def fetch(self, url, **kwargs): except requests.ConnectionError: return - @logger.catch + def process(self, html, url): + """ + used for parse html + """ + for proxy in self.parse(html): + logger.info(f'fetched proxy {proxy.string()} from {url}') + yield proxy + def crawl(self): """ crawl main method """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url) - time.sleep(.5) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url) + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') diff --git a/proxypool/crawlers/public/data5u.py b/proxypool/crawlers/public/data5u.py index e36bf664..62158c20 100644 --- a/proxypool/crawlers/public/data5u.py +++ b/proxypool/crawlers/public/data5u.py @@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler): data5u crawler, http://www.data5u.com """ urls = [BASE_URL] - - headers = { - 'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36' - } - @logger.catch - def crawl(self): - """ - crawl main method - """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url, headers=self.headers) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy - def parse(self, html): """ parse html file to get proxies diff --git a/proxypool/crawlers/public/fatezero_proxylist.py b/proxypool/crawlers/public/fatezero.py similarity index 100% rename from proxypool/crawlers/public/fatezero_proxylist.py rename to proxypool/crawlers/public/fatezero.py diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py index 
ccf90a13..4ca5e529 100644 --- a/proxypool/crawlers/public/ihuan.py +++ b/proxypool/crawlers/public/ihuan.py @@ -13,6 +13,7 @@ class IhuanCrawler(BaseCrawler): path = time.strftime("%Y/%m/%d/%H", time.localtime()) urls = [BASE_URL.format(path=path)] ignore = False + def parse(self, html): """ parse html file to get proxies diff --git a/proxypool/crawlers/public/jiangxianli.py b/proxypool/crawlers/public/jiangxianli.py index 14fc46cc..04533e3b 100644 --- a/proxypool/crawlers/public/jiangxianli.py +++ b/proxypool/crawlers/public/jiangxianli.py @@ -1,23 +1,27 @@ from proxypool.schemas.proxy import Proxy from proxypool.crawlers.base import BaseCrawler -import re import json + + BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' MAX_PAGE = 10 + + class JiangxianliCrawler(BaseCrawler): """ jiangxianli crawler,https://ip.jiangxianli.com/ """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] - + def parse(self, html): """ parse html file to get proxies :return: """ - - result =json.loads(html) + + result = json.loads(html) if result['code'] != 0: return MAX_PAGE = int(result['data']['last_page']) diff --git a/proxypool/crawlers/public/xiaoshudaili.py b/proxypool/crawlers/public/xiaoshudaili.py index bb4d7d42..44977605 100644 --- a/proxypool/crawlers/public/xiaoshudaili.py +++ b/proxypool/crawlers/public/xiaoshudaili.py @@ -1,7 +1,5 @@ import re - from pyquery import PyQuery as pq - from proxypool.schemas.proxy import Proxy from proxypool.crawlers.base import BaseCrawler @@ -16,16 +14,23 @@ class XiaoShuCrawler(BaseCrawler): """ def __init__(self): - html = self.fetch(url=BASE_URL) + """ + init urls + """ + try: + html = self.fetch(url=BASE_URL) + except: + self.urls = [] + return doc = pq(html) title = doc(".title:eq(0) a").items() - latest_page = 0 for t in title: res = re.search(r"/(\d+)\.html", t.attr("href")) latest_page = int(res.group(1)) if res else 0 if latest_page: - self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)] + self.urls = [PAGE_BASE_URL.format(page=page) for page in range( + latest_page - MAX_PAGE, latest_page)] else: self.urls = [] diff --git a/proxypool/crawlers/public/xicidaili.py b/proxypool/crawlers/public/xicidaili.py index fdd2a317..53a4872e 100644 --- a/proxypool/crawlers/public/xicidaili.py +++ b/proxypool/crawlers/public/xicidaili.py @@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler): """ urls = [BASE_URL] ignore = True - - headers = { - 'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36' - } - @logger.catch - def crawl(self): - """ - crawl main method - """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url, headers=self.headers) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy - def parse(self, html): """ parse html file to get proxies @@ -49,4 +33,3 @@ def parse(self, html): crawler = XicidailiCrawler() for proxy in crawler.crawl(): print(proxy) - diff --git a/proxypool/crawlers/public/zhandaye.py b/proxypool/crawlers/public/zhandaye.py index 83af04b6..1522cdf0 100755 --- a/proxypool/crawlers/public/zhandaye.py +++ b/proxypool/crawlers/public/zhandaye.py @@ -8,6 +8,7 @@ BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html' MAX_PAGE = 5 * 2 + class ZhandayeCrawler(BaseCrawler): """ zhandaye crawler, https://www.zdaye.com/dayProxy/ @@ -56,4 +57,3 @@ def parse(self, html): crawler = 
ZhandayeCrawler() for proxy in crawler.crawl(): print(proxy) - diff --git a/proxypool/processors/getter.py b/proxypool/processors/getter.py index 1a1d5261..877e198a 100644 --- a/proxypool/processors/getter.py +++ b/proxypool/processors/getter.py @@ -8,7 +8,7 @@ class Getter(object): """ getter of proxypool """ - + def __init__(self): """ init db and crawlers @@ -16,14 +16,14 @@ def __init__(self): self.redis = RedisClient() self.crawlers_cls = crawlers_cls self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls] - + def is_full(self): """ if proxypool if full return: bool """ return self.redis.count() >= PROXY_NUMBER_MAX - + @logger.catch def run(self): """ From 642a59571621788d4a7fef7218b05a8a2694fd3a Mon Sep 17 00:00:00 2001 From: "Hiroshi.tao" <me@tcw.im> Date: Thu, 30 Dec 2021 13:43:05 +0800 Subject: [PATCH 31/65] update dockerfile and build.yaml (#135) --- .dockerignore | 4 ++- .github/workflows/build.yml | 49 ++++++++++++++++++++++++++----------- .gitignore | 3 ++- Dockerfile | 24 ++++++++++-------- supervisord.conf | 11 +++++++++ 5 files changed, 65 insertions(+), 26 deletions(-) diff --git a/.dockerignore b/.dockerignore index f9306766..3eab792f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -130,4 +130,6 @@ dmypy.json # Pyre type checker .pyre/ -proxypool/.env \ No newline at end of file +proxypool/.env +.DS_Store +.vscode \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 641861f1..1e5caef8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,22 +2,43 @@ name: build on: push: branches: - - master + - master paths-ignore: - - .gitignore - - README.md - - '.github/ISSUE_TEMPLATE/**' + - .gitignore + - README.md + - '.github/ISSUE_TEMPLATE/**' + release: + types: [published] + jobs: build: runs-on: ubuntu-latest steps: - - name: Checkout Source - uses: actions/checkout@v1 - - name: Docker Login - run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - - name: Build the Docker Image - run: docker-compose -f build.yaml build - - name: Tag and Push Master Version - run: | - docker push germey/proxypool:master - + - name: Checkout + uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: germey + password: ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} + + - name: Extract branch name + id: branch + shell: bash + run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF##*/} | sed 's/master/latest/')" + + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + push: true + platforms: linux/amd64 + tags: | + germey/proxypool:${{ steps.branch.outputs.tag }} diff --git a/.gitignore b/.gitignore index 7f21799f..16a7490c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.db venv /.idea -*.log \ No newline at end of file +*.log +.DS_Store \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 1eb790fe..68ce7b40 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,17 @@ -FROM alpine:3.7 -WORKDIR /app -RUN apk add --no-cache --virtual .build-deps g++ python3-dev libffi-dev \ - openssl-dev libxml2-dev libxslt-dev gcc musl-dev py3-pip && \ - apk add --no-cache --update python3 && \ - pip3 install --upgrade pip setuptools +FROM python:3.7-alpine AS build COPY requirements.txt . 
-RUN pip3 install -r requirements.txt && \ -apk del g++ gcc musl-dev libxml2-dev +RUN apk update &&\ + apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev &&\ + pip install --timeout 30 --user --no-cache-dir --no-warn-script-location -r requirements.txt + +FROM python:3.7-alpine +ENV APP_ENV=prod +ENV LOCAL_PKG="/root/.local" +COPY --from=build ${LOCAL_PKG} ${LOCAL_PKG} +RUN apk update && apk add --no-cache libffi-dev openssl-dev libxslt-dev &&\ + ln -sf ${LOCAL_PKG}/bin/* /usr/local/bin/ +WORKDIR /app COPY . . -# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +EXPOSE 5555 VOLUME ["/app/proxypool/crawlers/private"] -CMD ["supervisord", "-c", "supervisord.conf"] +ENTRYPOINT ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/supervisord.conf b/supervisord.conf index 97f2f86f..aff2cd64 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -1,6 +1,17 @@ +[unix_http_server] +file=/run/supervisor.sock +chmod=0700 + [supervisord] +pidfile=/run/supervisord.pid nodaemon=true +[supervisorctl] +serverurl=unix:///run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface + [program:tester] process_name=tester command=python3 run.py --processor tester From 8a21396633c6d2455e411dbd9cc3509e6daad7b4 Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Thu, 30 Dec 2021 14:18:28 +0800 Subject: [PATCH 32/65] add env prefix --- proxypool/scheduler.py | 50 +++++++++++++++++++++---------------- proxypool/setting.py | 20 +++++++-------- proxypool/storages/redis.py | 20 ++++++++++----- proxypool/utils/proxy.py | 6 +++++ 4 files changed, 58 insertions(+), 38 deletions(-) diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index bec0c595..46a6b6d0 100644 --- a/proxypool/scheduler.py +++ b/proxypool/scheduler.py @@ -18,7 +18,7 @@ class Scheduler(): """ scheduler """ - + def run_tester(self, cycle=CYCLE_TESTER): """ run tester @@ -33,7 +33,7 @@ def run_tester(self, cycle=CYCLE_TESTER): tester.run() loop += 1 time.sleep(cycle) - + def run_getter(self, cycle=CYCLE_GETTER): """ run getter @@ -48,7 +48,7 @@ def run_getter(self, cycle=CYCLE_GETTER): getter.run() loop += 1 time.sleep(cycle) - + def run_server(self): """ run server for api @@ -57,42 +57,48 @@ def run_server(self): logger.info('server not enabled, exit') return app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) - + def run(self): global tester_process, getter_process, server_process try: logger.info('starting proxypool...') if ENABLE_TESTER: - tester_process = multiprocessing.Process(target=self.run_tester) + tester_process = multiprocessing.Process( + target=self.run_tester) logger.info(f'starting tester, pid {tester_process.pid}...') tester_process.start() - + if ENABLE_GETTER: - getter_process = multiprocessing.Process(target=self.run_getter) + getter_process = multiprocessing.Process( + target=self.run_getter) logger.info(f'starting getter, pid{getter_process.pid}...') getter_process.start() - + if ENABLE_SERVER: - server_process = multiprocessing.Process(target=self.run_server) + server_process = multiprocessing.Process( + target=self.run_server) logger.info(f'starting server, pid{server_process.pid}...') server_process.start() - - tester_process.join() - getter_process.join() - server_process.join() + + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() except KeyboardInterrupt: logger.info('received 
keyboard interrupt signal') - tester_process.terminate() - getter_process.terminate() - server_process.terminate() + tester_process and tester_process.terminate() + getter_process and getter_process.terminate() + server_process and server_process.terminate() finally: # must call join method before calling is_alive - tester_process.join() - getter_process.join() - server_process.join() - logger.info(f'tester is {"alive" if tester_process.is_alive() else "dead"}') - logger.info(f'getter is {"alive" if getter_process.is_alive() else "dead"}') - logger.info(f'server is {"alive" if server_process.is_alive() else "dead"}') + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() + logger.info( + f'tester is {"alive" if tester_process.is_alive() else "dead"}') + logger.info( + f'getter is {"alive" if getter_process.is_alive() else "dead"}') + logger.info( + f'server is {"alive" if server_process.is_alive() else "dead"}') logger.info('proxy terminated') diff --git a/proxypool/setting.py b/proxypool/setting.py index 5765714b..d6d2d8ca 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -24,18 +24,19 @@ APP_TEST = IS_TEST = APP_ENV == TEST_MODE # redis host -REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1') +REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', + env.str('REDIS_HOST', '127.0.0.1')) # redis port -REDIS_PORT = env.int('REDIS_PORT', 6379) +REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379)) # redis password, if no password, set it to None -REDIS_PASSWORD = env.str('REDIS_PASSWORD', None) +REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD', + env.str('REDIS_PASSWORD', None)) # redis db, if no choice, set it to 0 -REDIS_DB = env.int('REDIS_DB', 0) -# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0 -REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None) - -if REDIS_CONNECTION_STRING: - REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING) +REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0)) +# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0, +# please refer to https://redis-py.readthedocs.io/en/stable/connections.html#redis.client.Redis.from_url +REDIS_CONNECTION_STRING = env.str( + 'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None)) # redis hash table key name REDIS_KEY = env.str('REDIS_KEY', 'proxies:universal') @@ -78,4 +79,3 @@ # logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') # logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') - diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 0ebbccc2..5ae145bc 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -1,7 +1,7 @@ import redis from proxypool.exceptions import PoolEmptyException from proxypool.schemas.proxy import Proxy -from proxypool.setting import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \ +from proxypool.setting import REDIS_CONNECTION_STRING, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \ PROXY_SCORE_INIT from random import choice from typing import List @@ -18,14 +18,21 @@ class RedisClient(object): redis connection client of proxypool """ - def __init__(self, 
host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, **kwargs): + def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, + connection_string=REDIS_CONNECTION_STRING, **kwargs): """ init redis client :param host: redis host :param port: redis port :param password: redis password + :param connection_string: redis connection_string """ - self.db = redis.StrictRedis(host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) + # if set connection_string, just use it + if connection_string: + self.db = redis.StrictRedis.from_url(connection_string) + else: + self.db = redis.StrictRedis( + host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int: """ @@ -51,11 +58,13 @@ def random(self) -> Proxy: :return: proxy, like 8.8.8.8:8 """ # try to get proxy with max score - proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX) + proxies = self.db.zrangebyscore( + REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else get proxy by rank - proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX) + proxies = self.db.zrevrange( + REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else raise error @@ -125,4 +134,3 @@ def batch(self, cursor, count) -> List[Proxy]: conn = RedisClient() result = conn.random() print(result) - diff --git a/proxypool/utils/proxy.py b/proxypool/utils/proxy.py index b22e05a4..294033fc 100644 --- a/proxypool/utils/proxy.py +++ b/proxypool/utils/proxy.py @@ -2,6 +2,9 @@ def is_valid_proxy(data): + """ + check this string is within proxy format + """ if data.__contains__(':'): ip = data.split(':')[0] port = data.split(':')[1] @@ -11,6 +14,9 @@ def is_valid_proxy(data): def is_ip_valid(ip): + """ + check this string is within ip format + """ a = ip.split('.') if len(a) != 4: return False From 00d438809cbb454228422da02da89e61039951c7 Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Thu, 30 Dec 2021 14:44:31 +0800 Subject: [PATCH 33/65] add prefix --- build.yaml | 3 +-- docker-compose.yml | 2 +- proxypool/setting.py | 4 ++-- proxypool/utils/parse.py | 13 ------------- 4 files changed, 4 insertions(+), 18 deletions(-) delete mode 100644 proxypool/utils/parse.py diff --git a/build.yaml b/build.yaml index 82fd9559..74b2fd0b 100644 --- a/build.yaml +++ b/build.yaml @@ -5,7 +5,6 @@ services: container_name: redis4proxypool ports: - "6374:6379" - # restart: always proxypool: build: . 
image: "germey/proxypool:master" @@ -16,4 +15,4 @@ services: # volumes: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis4proxypool + PROXYPOOL_REDIS_CONNECTION_STRING: redis://@redis4proxypool:6379/0 diff --git a/docker-compose.yml b/docker-compose.yml index 3c57f280..0b369788 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,4 +14,4 @@ services: # volumes: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis4proxypool + PROXYPOOL_REDIS_HOST: redis4proxypool diff --git a/proxypool/setting.py b/proxypool/setting.py index d6d2d8ca..d11771f4 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -2,7 +2,6 @@ from os.path import dirname, abspath, join from environs import Env from loguru import logger -from proxypool.utils.parse import parse_redis_connection_string env = Env() @@ -39,7 +38,8 @@ 'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None)) # redis hash table key name -REDIS_KEY = env.str('REDIS_KEY', 'proxies:universal') +REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str( + 'REDIS_KEY', 'proxies:universal')) # definition of proxy scores PROXY_SCORE_MAX = 100 diff --git a/proxypool/utils/parse.py b/proxypool/utils/parse.py deleted file mode 100644 index b3f42f5f..00000000 --- a/proxypool/utils/parse.py +++ /dev/null @@ -1,13 +0,0 @@ -import re - -def parse_redis_connection_string(connection_string): - """ - parse a redis connection string, for example: - redis://[password]@host:port - rediss://[password]@host:port - :param connection_string: - :return: - """ - result = re.match('rediss?:\/\/(.*?)@(.*?):(\d+)\/(\d+)', connection_string) - return result.group(2), int(result.group(3)), (result.group(1) or None), (result.group(4) or 0) if result \ - else ('localhost', 6379, None) From 04949758adf81f83ca3b7ab3c8396283c2ff57d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Thu, 30 Dec 2021 14:47:56 +0800 Subject: [PATCH 34/65] Add prefix for redis env (#137) * add env prefix * add prefix Co-authored-by: Germey <qicu@microsoft.com> --- build.yaml | 3 +-- docker-compose.yml | 2 +- proxypool/scheduler.py | 50 +++++++++++++++++++++---------------- proxypool/setting.py | 24 +++++++++--------- proxypool/storages/redis.py | 20 ++++++++++----- proxypool/utils/parse.py | 13 ---------- proxypool/utils/proxy.py | 6 +++++ 7 files changed, 62 insertions(+), 56 deletions(-) delete mode 100644 proxypool/utils/parse.py diff --git a/build.yaml b/build.yaml index 82fd9559..74b2fd0b 100644 --- a/build.yaml +++ b/build.yaml @@ -5,7 +5,6 @@ services: container_name: redis4proxypool ports: - "6374:6379" - # restart: always proxypool: build: . 
image: "germey/proxypool:master" @@ -16,4 +15,4 @@ services: # volumes: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis4proxypool + PROXYPOOL_REDIS_CONNECTION_STRING: redis://@redis4proxypool:6379/0 diff --git a/docker-compose.yml b/docker-compose.yml index 3c57f280..0b369788 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,4 +14,4 @@ services: # volumes: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: - REDIS_HOST: redis4proxypool + PROXYPOOL_REDIS_HOST: redis4proxypool diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index bec0c595..46a6b6d0 100644 --- a/proxypool/scheduler.py +++ b/proxypool/scheduler.py @@ -18,7 +18,7 @@ class Scheduler(): """ scheduler """ - + def run_tester(self, cycle=CYCLE_TESTER): """ run tester @@ -33,7 +33,7 @@ def run_tester(self, cycle=CYCLE_TESTER): tester.run() loop += 1 time.sleep(cycle) - + def run_getter(self, cycle=CYCLE_GETTER): """ run getter @@ -48,7 +48,7 @@ def run_getter(self, cycle=CYCLE_GETTER): getter.run() loop += 1 time.sleep(cycle) - + def run_server(self): """ run server for api @@ -57,42 +57,48 @@ def run_server(self): logger.info('server not enabled, exit') return app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) - + def run(self): global tester_process, getter_process, server_process try: logger.info('starting proxypool...') if ENABLE_TESTER: - tester_process = multiprocessing.Process(target=self.run_tester) + tester_process = multiprocessing.Process( + target=self.run_tester) logger.info(f'starting tester, pid {tester_process.pid}...') tester_process.start() - + if ENABLE_GETTER: - getter_process = multiprocessing.Process(target=self.run_getter) + getter_process = multiprocessing.Process( + target=self.run_getter) logger.info(f'starting getter, pid{getter_process.pid}...') getter_process.start() - + if ENABLE_SERVER: - server_process = multiprocessing.Process(target=self.run_server) + server_process = multiprocessing.Process( + target=self.run_server) logger.info(f'starting server, pid{server_process.pid}...') server_process.start() - - tester_process.join() - getter_process.join() - server_process.join() + + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() except KeyboardInterrupt: logger.info('received keyboard interrupt signal') - tester_process.terminate() - getter_process.terminate() - server_process.terminate() + tester_process and tester_process.terminate() + getter_process and getter_process.terminate() + server_process and server_process.terminate() finally: # must call join method before calling is_alive - tester_process.join() - getter_process.join() - server_process.join() - logger.info(f'tester is {"alive" if tester_process.is_alive() else "dead"}') - logger.info(f'getter is {"alive" if getter_process.is_alive() else "dead"}') - logger.info(f'server is {"alive" if server_process.is_alive() else "dead"}') + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() + logger.info( + f'tester is {"alive" if tester_process.is_alive() else "dead"}') + logger.info( + f'getter is {"alive" if getter_process.is_alive() else "dead"}') + logger.info( + f'server is {"alive" if server_process.is_alive() else "dead"}') logger.info('proxy terminated') diff --git a/proxypool/setting.py b/proxypool/setting.py index 5765714b..d11771f4 100644 --- a/proxypool/setting.py +++ 
b/proxypool/setting.py @@ -2,7 +2,6 @@ from os.path import dirname, abspath, join from environs import Env from loguru import logger -from proxypool.utils.parse import parse_redis_connection_string env = Env() @@ -24,21 +23,23 @@ APP_TEST = IS_TEST = APP_ENV == TEST_MODE # redis host -REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1') +REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', + env.str('REDIS_HOST', '127.0.0.1')) # redis port -REDIS_PORT = env.int('REDIS_PORT', 6379) +REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379)) # redis password, if no password, set it to None -REDIS_PASSWORD = env.str('REDIS_PASSWORD', None) +REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD', + env.str('REDIS_PASSWORD', None)) # redis db, if no choice, set it to 0 -REDIS_DB = env.int('REDIS_DB', 0) -# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0 -REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None) - -if REDIS_CONNECTION_STRING: - REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING) +REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0)) +# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0, +# please refer to https://redis-py.readthedocs.io/en/stable/connections.html#redis.client.Redis.from_url +REDIS_CONNECTION_STRING = env.str( + 'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None)) # redis hash table key name -REDIS_KEY = env.str('REDIS_KEY', 'proxies:universal') +REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str( + 'REDIS_KEY', 'proxies:universal')) # definition of proxy scores PROXY_SCORE_MAX = 100 @@ -78,4 +79,3 @@ # logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') # logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') - diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 0ebbccc2..5ae145bc 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -1,7 +1,7 @@ import redis from proxypool.exceptions import PoolEmptyException from proxypool.schemas.proxy import Proxy -from proxypool.setting import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \ +from proxypool.setting import REDIS_CONNECTION_STRING, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \ PROXY_SCORE_INIT from random import choice from typing import List @@ -18,14 +18,21 @@ class RedisClient(object): redis connection client of proxypool """ - def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, **kwargs): + def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, + connection_string=REDIS_CONNECTION_STRING, **kwargs): """ init redis client :param host: redis host :param port: redis port :param password: redis password + :param connection_string: redis connection_string """ - self.db = redis.StrictRedis(host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) + # if set connection_string, just use it + if connection_string: + self.db = redis.StrictRedis.from_url(connection_string) + else: + self.db = redis.StrictRedis( + host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int: """ @@ -51,11 +58,13 @@ 
def random(self) -> Proxy: :return: proxy, like 8.8.8.8:8 """ # try to get proxy with max score - proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX) + proxies = self.db.zrangebyscore( + REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else get proxy by rank - proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX) + proxies = self.db.zrevrange( + REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else raise error @@ -125,4 +134,3 @@ def batch(self, cursor, count) -> List[Proxy]: conn = RedisClient() result = conn.random() print(result) - diff --git a/proxypool/utils/parse.py b/proxypool/utils/parse.py deleted file mode 100644 index b3f42f5f..00000000 --- a/proxypool/utils/parse.py +++ /dev/null @@ -1,13 +0,0 @@ -import re - -def parse_redis_connection_string(connection_string): - """ - parse a redis connection string, for example: - redis://[password]@host:port - rediss://[password]@host:port - :param connection_string: - :return: - """ - result = re.match('rediss?:\/\/(.*?)@(.*?):(\d+)\/(\d+)', connection_string) - return result.group(2), int(result.group(3)), (result.group(1) or None), (result.group(4) or 0) if result \ - else ('localhost', 6379, None) diff --git a/proxypool/utils/proxy.py b/proxypool/utils/proxy.py index b22e05a4..294033fc 100644 --- a/proxypool/utils/proxy.py +++ b/proxypool/utils/proxy.py @@ -2,6 +2,9 @@ def is_valid_proxy(data): + """ + check this string is within proxy format + """ if data.__contains__(':'): ip = data.split(':')[0] port = data.split(':')[1] @@ -11,6 +14,9 @@ def is_valid_proxy(data): def is_ip_valid(ip): + """ + check this string is within ip format + """ a = ip.split('.') if len(a) != 4: return False From 1ccb847bbf03d01b06f016ccb8dd303f88a272cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Thu, 30 Dec 2021 14:53:34 +0800 Subject: [PATCH 35/65] Update build.yml --- .github/workflows/build.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1e5caef8..f04456d6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,11 +29,6 @@ jobs: username: germey password: ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - - name: Extract branch name - id: branch - shell: bash - run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF##*/} | sed 's/master/latest/')" - - name: Build and push uses: docker/build-push-action@v2 with: @@ -41,4 +36,5 @@ jobs: push: true platforms: linux/amd64 tags: | - germey/proxypool:${{ steps.branch.outputs.tag }} + germey/proxypool:latest + germey/proxypool:master From 9c78753f2e6943244e74c0ba143c7db5755c8ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Thu, 30 Dec 2021 15:01:50 +0800 Subject: [PATCH 36/65] Update build.yml --- .github/workflows/build.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f04456d6..fe439bc2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,7 +28,11 @@ jobs: with: username: germey password: ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - + + - name: Get current date + id: date + run: echo "::set-output name=date::$(date +'%Y%m%d')" + - name: Build and push uses: 
docker/build-push-action@v2 with: @@ -38,3 +42,4 @@ jobs: tags: | germey/proxypool:latest germey/proxypool:master + germey/proxypool:${{ steps.date.outputs.date }} From a5ff1833599bbf8bd915b83336bfafba02d4d11f Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Thu, 30 Dec 2021 15:09:41 +0800 Subject: [PATCH 37/65] update README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 17a8fab3..7897c7b2 100644 --- a/README.md +++ b/README.md @@ -236,12 +236,12 @@ get random proxy 116.196.115.209:8080 ### Redis 连接 -- REDIS_HOST:Redis 的 Host -- REDIS_PORT:Redis 的端口 -- REDIS_PASSWORD:Redis 的密码 -- REDIS_DB:Redis 的数据库索引,如 0、1 -- REDIS_CONNECTION_STRING:Redis 连接字符串 -- REDIS_KEY:Redis 储存代理使用字典的名称 +- PROXYPOOL_REDIS_HOST / REDIS_HOST:Redis 的 Host,其中 PROXYPOOL_REDIS_HOST 会覆盖 REDIS_HOST 的值。 +- PROXYPOOL_REDIS_PORT / REDIS_PORT:Redis 的端口,其中 PROXYPOOL_REDIS_PORT 会覆盖 REDIS_PORT 的值。 +- PROXYPOOL_REDIS_PASSWORD / REDIS_PASSWORD:Redis 的密码,其中 PROXYPOOL_REDIS_PASSWORD 会覆盖 REDIS_PASSWORD 的值。 +- PROXYPOOL_REDIS_DB / REDIS_DB:Redis 的数据库索引,如 0、1,其中 PROXYPOOL_REDIS_DB 会覆盖 REDIS_DB 的值。 +- PROXYPOOL_REDIS_CONNECTION_STRING / REDIS_CONNECTION_STRING:Redis 连接字符串,其中 PROXYPOOL_REDIS_CONNECTION_STRING 会覆盖 REDIS_CONNECTION_STRING 的值。 +- PROXYPOOL_REDIS_KEY / REDIS_KEY:Redis 储存代理使用字典的名称,其中 PROXYPOOL_REDIS_KEY 会覆盖 REDIS_KEY 的值。 ### 处理器 From 43336d45c33b438d25886f6d8b4b9629fc29cf5e Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Thu, 30 Dec 2021 15:19:05 +0800 Subject: [PATCH 38/65] update release --- release.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 release.sh diff --git a/release.sh b/release.sh new file mode 100755 index 00000000..342cd06f --- /dev/null +++ b/release.sh @@ -0,0 +1,2 @@ +git tag -a "`date +'%Y%m%d'`" -m "Release `date +'%Y%m%d'`" +git push origin --tags \ No newline at end of file From f37f3ce46fbef656e061eceaa472ff87afc37c10 Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Fri, 31 Dec 2021 11:14:11 +0800 Subject: [PATCH 39/65] change max number --- proxypool/crawlers/public/daili66.py | 2 +- proxypool/crawlers/public/ip3366.py | 2 +- proxypool/crawlers/public/jiangxianli.py | 2 +- proxypool/crawlers/public/kuaidaili.py | 2 +- proxypool/crawlers/public/taiyangdaili.py | 2 +- proxypool/crawlers/public/xiaoshudaili.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py index 7b3bf7c2..aec7ea68 100644 --- a/proxypool/crawlers/public/daili66.py +++ b/proxypool/crawlers/public/daili66.py @@ -4,7 +4,7 @@ BASE_URL = 'http://www.66ip.cn/{page}.html' -MAX_PAGE = 50 +MAX_PAGE = 3 class Daili66Crawler(BaseCrawler): diff --git a/proxypool/crawlers/public/ip3366.py b/proxypool/crawlers/public/ip3366.py index 474a4f77..dfbc06f2 100644 --- a/proxypool/crawlers/public/ip3366.py +++ b/proxypool/crawlers/public/ip3366.py @@ -3,7 +3,7 @@ import re -MAX_PAGE = 8 +MAX_PAGE = 3 BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}' diff --git a/proxypool/crawlers/public/jiangxianli.py b/proxypool/crawlers/public/jiangxianli.py index 04533e3b..861dd1e5 100644 --- a/proxypool/crawlers/public/jiangxianli.py +++ b/proxypool/crawlers/public/jiangxianli.py @@ -5,7 +5,7 @@ BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' -MAX_PAGE = 10 +MAX_PAGE = 3 class JiangxianliCrawler(BaseCrawler): diff --git a/proxypool/crawlers/public/kuaidaili.py b/proxypool/crawlers/public/kuaidaili.py index 
71ab1717..3602833e 100644 --- a/proxypool/crawlers/public/kuaidaili.py +++ b/proxypool/crawlers/public/kuaidaili.py @@ -5,7 +5,7 @@ BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/' -MAX_PAGE = 300 +MAX_PAGE = 3 class KuaidailiCrawler(BaseCrawler): diff --git a/proxypool/crawlers/public/taiyangdaili.py b/proxypool/crawlers/public/taiyangdaili.py index 7a48cb43..b42388cc 100644 --- a/proxypool/crawlers/public/taiyangdaili.py +++ b/proxypool/crawlers/public/taiyangdaili.py @@ -3,7 +3,7 @@ from pyquery import PyQuery as pq BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' -MAX_PAGE = 5 +MAX_PAGE = 3 class TaiyangdailiCrawler(BaseCrawler): diff --git a/proxypool/crawlers/public/xiaoshudaili.py b/proxypool/crawlers/public/xiaoshudaili.py index 44977605..f6fd0869 100644 --- a/proxypool/crawlers/public/xiaoshudaili.py +++ b/proxypool/crawlers/public/xiaoshudaili.py @@ -5,7 +5,7 @@ BASE_URL = "http://www.xsdaili.cn/" PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html" -MAX_PAGE = 50 +MAX_PAGE = 3 class XiaoShuCrawler(BaseCrawler): From ad82d46fcd101fee1a491e7b3b450031652b9509 Mon Sep 17 00:00:00 2001 From: "Hiroshi.tao" <me@tcw.im> Date: Fri, 31 Dec 2021 11:17:47 +0800 Subject: [PATCH 40/65] =?UTF-8?q?=E6=AD=A3=E5=BC=8F=E7=8E=AF=E5=A2=83?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=A4=9A=E7=A7=8D=E6=96=B9=E6=B3=95=E8=BF=90?= =?UTF-8?q?=E8=A1=8Cserver=20(#138)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update dockerfile and build.yaml * update run app mode in prod - update readme * fix alpine python install gevent * fix #138 --- Dockerfile | 3 ++- README.md | 23 +++++++++--------- proxypool/processors/server.py | 9 ++++--- proxypool/scheduler.py | 44 ++++++++++++++++++++++++++++++---- proxypool/setting.py | 6 +++++ requirements.txt | 3 +++ 6 files changed, 68 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 68ce7b40..c5ca5440 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,8 @@ FROM python:3.7-alpine AS build COPY requirements.txt . 
RUN apk update &&\ - apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev &&\ + apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev build-base musl-dev &&\ + pip install -U pip &&\ pip install --timeout 30 --user --no-cache-dir --no-warn-script-location -r requirements.txt FROM python:3.7-alpine diff --git a/README.md b/README.md index 7897c7b2..3e7fb011 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ cd ProxyPool 安装方法自行搜索即可。 +官方 Docker Hub 镜像:[germey/proxypool](https://hub.docker.com/r/germey/proxypool) + ### 常规方式 常规方式要求有 Python 环境、Redis 环境,具体要求如下: @@ -100,25 +102,20 @@ docker-compose -f build.yml up 设置 host、port、password,如果 password 为空可以设置为空字符串,示例如下: ```shell script -export REDIS_HOST='localhost' -export REDIS_PORT=6379 -export REDIS_PASSWORD='' -export REDIS_DB=0 +export PROXYPOOL_REDIS_HOST='localhost' +export PROXYPOOL_REDIS_PORT=6379 +export PROXYPOOL_REDIS_PASSWORD='' +export PROXYPOOL_REDIS_DB=0 ``` 或者只设置连接字符串: ```shell script -export REDIS_CONNECTION_STRING='redis://[password]@host:port/db' -``` - -如果没有密码也要设置为: - -```shell script -export REDIS_CONNECTION_STRING='redis://@host:port/db' +export PROXYPOOL_REDIS_CONNECTION_STRING='redis://localhost' ``` -这里连接字符串的格式需要符合 `redis://[password]@host:port/db` 的格式,注意不要遗漏 `@`。 +这里连接字符串的格式需要符合 `redis://[:password@]host[:port][/database]` 的格式, +中括号参数可以省略,port默认是6379,database默认是0,密码默认为空。 以上两种设置任选其一即可。 @@ -233,6 +230,8 @@ get random proxy 116.196.115.209:8080 - APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev - APP_DEBUG:调试模式,可以设置 true 或 false,默认 true +- APP_PROD_METHOD: 正式环境启动应用方式,默认是`gevent`, + 可选:`tornado`,`meinheld`(分别需要安装tornado或meinheld模块) ### Redis 连接 diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index d3edd70d..f7138c64 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,11 +1,13 @@ from flask import Flask, g from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED +from proxypool.setting import API_HOST, API_PORT, API_THREADED, IS_DEV __all__ = ['app'] app = Flask(__name__) +if IS_DEV: + app.debug = True def get_conn(): @@ -46,8 +48,9 @@ def get_proxy_all(): conn = get_conn() proxies = conn.all() proxies_string = '' - for proxy in proxies: - proxies_string += str(proxy) + '\n' + if proxies: + for proxy in proxies: + proxies_string += str(proxy) + '\n' return proxies_string diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index 46a6b6d0..16c3c47a 100644 --- a/proxypool/scheduler.py +++ b/proxypool/scheduler.py @@ -3,7 +3,8 @@ from proxypool.processors.server import app from proxypool.processors.getter import Getter from proxypool.processors.tester import Tester -from proxypool.setting import CYCLE_GETTER, CYCLE_TESTER, API_HOST, API_THREADED, API_PORT, ENABLE_SERVER, \ +from proxypool.setting import CYCLE_GETTER, CYCLE_TESTER, API_HOST, \ + API_THREADED, API_PORT, ENABLE_SERVER, IS_PROD, APP_PROD_METHOD, \ ENABLE_GETTER, ENABLE_TESTER, IS_WINDOWS from loguru import logger @@ -56,7 +57,42 @@ def run_server(self): if not ENABLE_SERVER: logger.info('server not enabled, exit') return - app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) + if IS_PROD: + if APP_PROD_METHOD == 'gevent': + try: + from gevent.pywsgi import WSGIServer + except ImportError as e: + logger.exception(e) + else: + http_server = WSGIServer((API_HOST, API_PORT), app) + http_server.serve_forever() + + elif APP_PROD_METHOD == 'tornado': + try: + from tornado.wsgi import 
WSGIContainer + from tornado.httpserver import HTTPServer + from tornado.ioloop import IOLoop + except ImportError as e: + logger.exception(e) + else: + http_server = HTTPServer(WSGIContainer(app)) + http_server.listen(API_PORT) + IOLoop.instance().start() + + elif APP_PROD_METHOD == "meinheld": + try: + import meinheld + except ImportError as e: + logger.exception(e) + else: + meinheld.listen((API_HOST, API_PORT)) + meinheld.run(app) + + else: + logger.error("unsupported APP_PROD_METHOD") + return + else: + app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) def run(self): global tester_process, getter_process, server_process @@ -71,13 +107,13 @@ def run(self): if ENABLE_GETTER: getter_process = multiprocessing.Process( target=self.run_getter) - logger.info(f'starting getter, pid{getter_process.pid}...') + logger.info(f'starting getter, pid {getter_process.pid}...') getter_process.start() if ENABLE_SERVER: server_process = multiprocessing.Process( target=self.run_server) - logger.info(f'starting server, pid{server_process.pid}...') + logger.info(f'starting server, pid {server_process.pid}...') server_process.start() tester_process and tester_process.join() diff --git a/proxypool/setting.py b/proxypool/setting.py index d11771f4..ba8b3895 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -22,6 +22,12 @@ APP_PROD = IS_PROD = APP_ENV == PROD_MODE APP_TEST = IS_TEST = APP_ENV == TEST_MODE +# Which WSGI container is used to run applications +# - gevent: pip install gevent +# - tornado: pip install tornado +# - meinheld: pip install meinheld +APP_PROD_METHOD = env.str('APP_PROD_METHOD', "gevent").lower() + # redis host REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', env.str('REDIS_HOST', '127.0.0.1')) diff --git a/requirements.txt b/requirements.txt index aa5eddcc..3c797aed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,6 @@ redis==3.5.3 lxml==4.6.5 fake_headers==1.0.2 maxminddb_geolite2==2018.703 +gevent>=21.1.0 +tornado>=6.0 +meinheld>=1.0.0 \ No newline at end of file From 20d16f97b428364649de2e9036d5f235f771f2e5 Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Fri, 31 Dec 2021 11:20:32 +0800 Subject: [PATCH 41/65] fix --- proxypool/scheduler.py | 8 ++++---- proxypool/setting.py | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index 16c3c47a..f29b5655 100644 --- a/proxypool/scheduler.py +++ b/proxypool/scheduler.py @@ -3,7 +3,7 @@ from proxypool.processors.server import app from proxypool.processors.getter import Getter from proxypool.processors.tester import Tester -from proxypool.setting import CYCLE_GETTER, CYCLE_TESTER, API_HOST, \ +from proxypool.setting import APP_PROD_METHOD_GEVENT, APP_PROD_METHOD_MEINHELD, APP_PROD_METHOD_TORNADO, CYCLE_GETTER, CYCLE_TESTER, API_HOST, \ API_THREADED, API_PORT, ENABLE_SERVER, IS_PROD, APP_PROD_METHOD, \ ENABLE_GETTER, ENABLE_TESTER, IS_WINDOWS from loguru import logger @@ -58,7 +58,7 @@ def run_server(self): logger.info('server not enabled, exit') return if IS_PROD: - if APP_PROD_METHOD == 'gevent': + if APP_PROD_METHOD == APP_PROD_METHOD_GEVENT: try: from gevent.pywsgi import WSGIServer except ImportError as e: @@ -67,7 +67,7 @@ def run_server(self): http_server = WSGIServer((API_HOST, API_PORT), app) http_server.serve_forever() - elif APP_PROD_METHOD == 'tornado': + elif APP_PROD_METHOD == APP_PROD_METHOD_TORNADO: try: from tornado.wsgi import WSGIContainer from tornado.httpserver import HTTPServer @@ -79,7 +79,7 @@ def 
run_server(self): http_server.listen(API_PORT) IOLoop.instance().start() - elif APP_PROD_METHOD == "meinheld": + elif APP_PROD_METHOD == APP_PROD_METHOD_MEINHELD: try: import meinheld except ImportError as e: diff --git a/proxypool/setting.py b/proxypool/setting.py index ba8b3895..b00bc888 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -26,7 +26,10 @@ # - gevent: pip install gevent # - tornado: pip install tornado # - meinheld: pip install meinheld -APP_PROD_METHOD = env.str('APP_PROD_METHOD', "gevent").lower() +APP_PROD_METHOD_GEVENT = 'gevent' +APP_PROD_METHOD_TORNADO = 'tornado' +APP_PROD_METHOD_MEINHELD = 'meinheld' +APP_PROD_METHOD = env.str('APP_PROD_METHOD', APP_PROD_METHOD_GEVENT).lower() # redis host REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', From 34d5a4389ac75d50fbc07903d3080ec69b6529db Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Fri, 31 Dec 2021 11:34:55 +0800 Subject: [PATCH 42/65] update decode --- proxypool/storages/redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 5ae145bc..8ab0e41d 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -29,7 +29,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db """ # if set connection_string, just use it if connection_string: - self.db = redis.StrictRedis.from_url(connection_string) + self.db = redis.StrictRedis.from_url(connection_string, decode_responses=True, **kwargs) else: self.db = redis.StrictRedis( host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) From b1dd8b2cec27880c0f9b36569adc3eab6ae82185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Fri, 31 Dec 2021 15:28:12 +0800 Subject: [PATCH 43/65] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e7fb011..00015c60 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 当然你也可以选择自己 Build,直接运行如下命令即可: ``` -docker-compose -f build.yml up +docker-compose -f build.yaml up ``` 如果下载速度特别慢,可以自行修改 Dockerfile,修改: From be7e3edf0028211c6e7e49d20ce9a9650d9b1d1d Mon Sep 17 00:00:00 2001 From: Germey <qicu@microsoft.com> Date: Sun, 2 Jan 2022 17:16:37 +0800 Subject: [PATCH 44/65] add port --- kubernetes/values.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kubernetes/values.yaml b/kubernetes/values.yaml index 1a7421d1..15b25377 100644 --- a/kubernetes/values.yaml +++ b/kubernetes/values.yaml @@ -16,8 +16,10 @@ deployment: memory: "200Mi" cpu: "80m" env: - - name: REDIS_HOST + - name: PROXYPOOL_REDIS_HOST value: "proxypool-redis" + - name: PROXYPOOL_REDIS_PORT + value: "6379" service: type: ClusterIP From feb5d354ff60f121848ed1e1bbfbedb49ef65583 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 27 Feb 2022 14:54:56 +0800 Subject: [PATCH 45/65] upgrade aiohttp --- proxypool/setting.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/proxypool/setting.py b/proxypool/setting.py index b00bc888..d3f65da9 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -70,7 +70,7 @@ TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10) TEST_BATCH = env.int('TEST_BATCH', 20) # only save anonymous proxy -TEST_ANONYMOUS = True +TEST_ANONYMOUS = env.bool('TEST_ANONYMOUS', True) # TEST_HEADERS = 
env.json('TEST_HEADERS', { # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', # }) diff --git a/requirements.txt b/requirements.txt index 3c797aed..f25c61d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ environs==9.3.0 Flask==1.1.2 attrs==20.3.0 retrying==1.3.3 -aiohttp==3.7.4 +aiohttp==3.8.1 requests==2.25.1 loguru==0.5.3 pyquery==1.4.3 From 23a3b2bce2f328182713bf795632cee93b1403be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Fri, 4 Mar 2022 12:02:47 +0800 Subject: [PATCH 46/65] Update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f25c61d7..81757ad9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ fake_headers==1.0.2 maxminddb_geolite2==2018.703 gevent>=21.1.0 tornado>=6.0 -meinheld>=1.0.0 \ No newline at end of file +meinheld>=1.0.0 +itsdangerous==0.24 From c683f8265efd6036337541ab70770c4046455e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Sun, 6 Mar 2022 12:09:17 +0800 Subject: [PATCH 47/65] fix (#146) --- proxypool/setting.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/proxypool/setting.py b/proxypool/setting.py index d3f65da9..4fb80fea 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -22,6 +22,7 @@ APP_PROD = IS_PROD = APP_ENV == PROD_MODE APP_TEST = IS_TEST = APP_ENV == TEST_MODE + # Which WSGI container is used to run applications # - gevent: pip install gevent # - tornado: pip install tornado @@ -86,5 +87,24 @@ ENABLE_GETTER = env.bool('ENABLE_GETTER', True) ENABLE_SERVER = env.bool('ENABLE_SERVER', True) -# logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') -# logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') + +ENABLE_LOG_FILE = env.bool('ENABLE_LOG_FILE', True) +ENABLE_LOG_RUNTIME_FILE = env.bool('ENABLE_LOG_RUNTIME_FILE', True) +ENABLE_LOG_ERROR_FILE = env.bool('ENABLE_LOG_ERROR_FILE', True) + + +LOG_LEVEL_MAP = { + DEV_MODE: "DEBUG", + TEST_MODE: "INFO", + PROD_MODE: "ERROR" +} + +LOG_LEVEL = LOG_LEVEL_MAP.get(APP_ENV) + +if ENABLE_LOG_FILE: + if ENABLE_LOG_RUNTIME_FILE: + logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), + level=LOG_LEVEL, rotation='1 week', retention='20 days') + if ENABLE_LOG_ERROR_FILE: + logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), + level='ERROR', rotation='1 week') From bd6c2fdcb3b0dc9aeb7547d50bdadb6898492ea3 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 6 Mar 2022 12:09:55 +0800 Subject: [PATCH 48/65] update version --- requirements.txt | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index 81757ad9..75ffa829 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,17 @@ -environs==9.3.0 -Flask==1.1.2 -attrs==20.3.0 -retrying==1.3.3 -aiohttp==3.8.1 -requests==2.25.1 -loguru==0.5.3 -pyquery==1.4.3 -supervisor==4.2.1 -redis==3.5.3 -lxml==4.6.5 -fake_headers==1.0.2 +environs>=9.3.0,<10.0.0 +Flask>=1.1.2,<2.0.0 +attrs>=20.3.0,<21.0.0 +retrying>=1.3.3,<2.0.0 +aiohttp>=3.8.1,<4.0.0 +requests>=2.25.1,<3.0.0 +loguru>=0.5.3,<1.0.0 
+pyquery>=1.4.3,<2.0.0 +supervisor>=4.2.1,<5.0.0 +redis>=3.5.3,<4.0.0 +lxml>=4.6.5,<5.0.0 +fake_headers>=1.0.2,<2.0.0 maxminddb_geolite2==2018.703 -gevent>=21.1.0 -tornado>=6.0 -meinheld>=1.0.0 -itsdangerous==0.24 +gevent>=21.1.0,<22.0.0 +tornado>=6.0,<7.0 +meinheld>=1.0.0,<2.0.0 +itsdangerous>=0.24,<1.0.0 From 8cdab49e417c0593adfac9fa8567572714a74b3c Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 6 Mar 2022 12:12:40 +0800 Subject: [PATCH 49/65] update readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 00015c60..8389e50d 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,7 @@ get random proxy 116.196.115.209:8080 - ENABLE_GETTER:允许 Getter 启动,默认 true - ENABLE_SERVER:运行 Server 启动,默认 true + ### 环境 - APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev @@ -259,6 +260,9 @@ get random proxy 116.196.115.209:8080 - LOG_DIR:日志相对路径 - LOG_RUNTIME_FILE:运行日志文件名称 - LOG_ERROR_FILE:错误日志文件名称 +- ENABLE_LOG_FILE:是否输出 log 文件,默认 true,如果设置为 false,那么 ENABLE_LOG_RUNTIME_FILE 和 ENABLE_LOG_ERROR_FILE 都不会生效 +- ENABLE_LOG_RUNTIME_FILE:是否输出 runtime log 文件,默认 true +- ENABLE_LOG_ERROR_FILE:是否输出 error log 文件,默认 true 以上内容均可使用环境变量配置,即在运行前设置对应环境变量值即可,如更改测试地址和 Redis 键名: From 1615d5de7ca4c5e263c6b5aa7200cf6191aee994 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 6 Mar 2022 12:25:24 +0800 Subject: [PATCH 50/65] remove port --- docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0b369788..782f5477 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,8 +3,8 @@ services: redis4proxypool: image: redis:alpine container_name: redis4proxypool - ports: - - "6374:6379" + # ports: + # - "6374:6379" proxypool: image: "germey/proxypool:master" container_name: proxypool From d7a0bcef7e63ae57a99ad094929a5889bb3168bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=94=E5=BA=86=E6=89=8D=E4=B8=A8=E9=9D=99=E8=A7=85?= <cqc@cuiqingcai.com> Date: Thu, 24 Mar 2022 11:26:20 +0800 Subject: [PATCH 51/65] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 75ffa829..38011966 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ gevent>=21.1.0,<22.0.0 tornado>=6.0,<7.0 meinheld>=1.0.0,<2.0.0 itsdangerous>=0.24,<1.0.0 +MarkupSafe<2.1.0 From ebb0bc6fe1b59f846751bb63bea0fba308178b21 Mon Sep 17 00:00:00 2001 From: Weltolk <40228052+Weltolk@users.noreply.github.com> Date: Sun, 3 Apr 2022 10:23:32 +0800 Subject: [PATCH 52/65] Update README.md (#153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 错了一个字 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8389e50d..0a44eb42 100644 --- a/README.md +++ b/README.md @@ -250,7 +250,7 @@ get random proxy 116.196.115.209:8080 - TEST_URL:测试 URL,默认百度 - TEST_TIMEOUT:测试超时时间,默认 10 秒 - TEST_BATCH:批量测试数量,默认 20 个代理 -- TEST_VALID_STATUS:测试有效的状态吗 +- TEST_VALID_STATUS:测试有效的状态码 - API_HOST:代理 Server 运行 Host,默认 0.0.0.0 - API_PORT:代理 Server 运行端口,默认 5555 - API_THREADED:代理 Server 是否使用多线程,默认 true From 0ebac358a2644ab93fbf078802e639f5c5c15a25 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Wed, 6 Apr 2022 22:16:35 +0800 Subject: [PATCH 53/65] fix --- proxypool/crawlers/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index 4a4bf5cd..611a816f 100644 --- 
a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -20,7 +20,7 @@ def fetch(self, url, **kwargs): if response.status_code == 200: response.encoding = 'utf-8' return response.text - except requests.ConnectionError: + except (requests.ConnectionError, requests.ReadTimeout): return def process(self, html, url): @@ -39,6 +39,8 @@ def crawl(self): for url in self.urls: logger.info(f'fetching {url}') html = self.fetch(url) + if not html: + continue time.sleep(.5) yield from self.process(html, url) except RetryError: From 2154cefc9a8ccd4a1a31edab8dbe09cff2350474 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Mon, 11 Jul 2022 00:33:26 +0800 Subject: [PATCH 54/65] add log settings --- README.md | 7 ++++--- docker-compose.yml | 1 + proxypool/setting.py | 9 +++++++-- requirements.txt | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0a44eb42..256f1155 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ export PROXYPOOL_REDIS_CONNECTION_STRING='redis://localhost' ``` 这里连接字符串的格式需要符合 `redis://[:password@]host[:port][/database]` 的格式, -中括号参数可以省略,port默认是6379,database默认是0,密码默认为空。 +中括号参数可以省略,port 默认是 6379,database 默认是 0,密码默认为空。 以上两种设置任选其一即可。 @@ -226,13 +226,12 @@ get random proxy 116.196.115.209:8080 - ENABLE_GETTER:允许 Getter 启动,默认 true - ENABLE_SERVER:运行 Server 启动,默认 true - ### 环境 - APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev - APP_DEBUG:调试模式,可以设置 true 或 false,默认 true - APP_PROD_METHOD: 正式环境启动应用方式,默认是`gevent`, - 可选:`tornado`,`meinheld`(分别需要安装tornado或meinheld模块) + 可选:`tornado`,`meinheld`(分别需要安装 tornado 或 meinheld 模块) ### Redis 连接 @@ -260,6 +259,8 @@ get random proxy 116.196.115.209:8080 - LOG_DIR:日志相对路径 - LOG_RUNTIME_FILE:运行日志文件名称 - LOG_ERROR_FILE:错误日志文件名称 +- LOG_ROTATION: 日志记录周转周期或大小,见 [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- LOG_RETENTION: 日志保留日期,见 [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) - ENABLE_LOG_FILE:是否输出 log 文件,默认 true,如果设置为 false,那么 ENABLE_LOG_RUNTIME_FILE 和 ENABLE_LOG_ERROR_FILE 都不会生效 - ENABLE_LOG_RUNTIME_FILE:是否输出 runtime log 文件,默认 true - ENABLE_LOG_ERROR_FILE:是否输出 error log 文件,默认 true diff --git a/docker-compose.yml b/docker-compose.yml index 782f5477..cf367f42 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,3 +15,4 @@ services: # - proxypool/crawlers/private:/app/proxypool/crawlers/private environment: PROXYPOOL_REDIS_HOST: redis4proxypool + diff --git a/proxypool/setting.py b/proxypool/setting.py index 4fb80fea..e68f45ca 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -2,6 +2,7 @@ from os.path import dirname, abspath, join from environs import Env from loguru import logger +import shutil env = Env() @@ -100,11 +101,15 @@ } LOG_LEVEL = LOG_LEVEL_MAP.get(APP_ENV) +LOG_ROTATION = env.str('LOG_ROTATION', '500MB') +LOG_RETENTION = env.str('LOG_RETENTION', '1 week') if ENABLE_LOG_FILE: if ENABLE_LOG_RUNTIME_FILE: logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), - level=LOG_LEVEL, rotation='1 week', retention='20 days') + level=LOG_LEVEL, rotation=LOG_ROTATION, retention=LOG_RETENTION) if ENABLE_LOG_ERROR_FILE: logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), - level='ERROR', rotation='1 week') + level='ERROR', rotation=LOG_ROTATION) +else: + shutil.rmtree(LOG_DIR, ignore_errors=True) diff --git a/requirements.txt b/requirements.txt index 38011966..4e7d5388 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -14,5 +14,5 @@ maxminddb_geolite2==2018.703 gevent>=21.1.0,<22.0.0 tornado>=6.0,<7.0 meinheld>=1.0.0,<2.0.0 -itsdangerous>=0.24,<1.0.0 +itsdangerous==0.24 MarkupSafe<2.1.0 From 78325d0ea06b026123a45abe31611ffab18493ab Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Mon, 11 Jul 2022 00:36:03 +0800 Subject: [PATCH 55/65] add readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 256f1155..d01d7a08 100644 --- a/README.md +++ b/README.md @@ -259,8 +259,8 @@ get random proxy 116.196.115.209:8080 - LOG_DIR:日志相对路径 - LOG_RUNTIME_FILE:运行日志文件名称 - LOG_ERROR_FILE:错误日志文件名称 -- LOG_ROTATION: 日志记录周转周期或大小,见 [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) -- LOG_RETENTION: 日志保留日期,见 [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- LOG_ROTATION: 日志记录周转周期或大小,默认 500MB,见 [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- LOG_RETENTION: 日志保留日期,默认 7 天,见 [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) - ENABLE_LOG_FILE:是否输出 log 文件,默认 true,如果设置为 false,那么 ENABLE_LOG_RUNTIME_FILE 和 ENABLE_LOG_ERROR_FILE 都不会生效 - ENABLE_LOG_RUNTIME_FILE:是否输出 runtime log 文件,默认 true - ENABLE_LOG_ERROR_FILE:是否输出 error log 文件,默认 true From 4c50711dde75e33de813acccab1042d5f95487de Mon Sep 17 00:00:00 2001 From: Takayama <49364055+MGMCN@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:55:52 +0900 Subject: [PATCH 56/65] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=20geonodedail?= =?UTF-8?q?i.py=20=E7=88=AC=E5=8F=96=E4=BB=A3=E7=90=86=20(#186)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add geonodedaili.py * add headers through crawl function --- proxypool/crawlers/public/geonodedaili.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 proxypool/crawlers/public/geonodedaili.py diff --git a/proxypool/crawlers/public/geonodedaili.py b/proxypool/crawlers/public/geonodedaili.py new file mode 100644 index 00000000..f71f16ec --- /dev/null +++ b/proxypool/crawlers/public/geonodedaili.py @@ -0,0 +1,71 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc' +MAX_PAGE = 18 + + +class GeonodeCrawler(BaseCrawler): + """ + Geonode crawler, https://proxylist.geonode.com/ + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in proxy_list: + host = proxy_item['ip'] + port = proxy_item['port'] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + def crawl(self): + """ + override crawl main method + add headers + """ + headers = { + 'authority': 'proxylist.geonode.com', + 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"', + 'accept': 'application/json, text/plain, */*', + 'sec-ch-ua-mobile': '?0', + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 
Safari/537.36', + 'sec-ch-ua-platform': '"macOS"', + 'origin': 'https://geonode.com', + 'sec-fetch-site': 'same-site', + 'sec-fetch-mode': 'cors', + 'sec-fetch-dest': 'empty', + 'referer': 'https://geonode.com/', + 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7', + 'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"', + } + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url, headers=headers) + if not html: + continue + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') + + +if __name__ == '__main__': + crawler = GeonodeCrawler() + for proxy in crawler.crawl(): + print(proxy) From 0fd5e6cf025848d4ef1c0f157b1ba92aa38dea71 Mon Sep 17 00:00:00 2001 From: Takayama <49364055+MGMCN@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:56:58 +0900 Subject: [PATCH 57/65] requirements.txt modified (#187) --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4e7d5388..c9407c7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,7 @@ redis>=3.5.3,<4.0.0 lxml>=4.6.5,<5.0.0 fake_headers>=1.0.2,<2.0.0 maxminddb_geolite2==2018.703 -gevent>=21.1.0,<22.0.0 +gevent>=21.8.0,<22.0.0 tornado>=6.0,<7.0 -meinheld>=1.0.0,<2.0.0 itsdangerous==0.24 MarkupSafe<2.1.0 From a0edcb0c4ce274b22ac0a8dcc7ea54225bcfdd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E5=A4=A7=E5=B8=85Aiden?= <38374506+dashuaixu@users.noreply.github.com> Date: Wed, 30 Aug 2023 21:49:30 +0800 Subject: [PATCH 58/65] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=A7=81=E4=BA=BA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E6=B1=A0=E5=9C=BA=E6=99=AF=E4=B8=AD=EF=BC=8C?= =?UTF-8?q?=E5=B8=A6=E8=BA=AB=E4=BB=BD=E8=AE=A4=E8=AF=81=E7=9A=84=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E7=9A=84=E6=94=AF=E6=8C=81=20(#198)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 增加私人代理池场景中,带身份认证的代理的支持 * 修复忘记修改部分 --------- Co-authored-by: dashuai xu <xujianghuai@126.com> --- proxypool/utils/proxy.py | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/proxypool/utils/proxy.py b/proxypool/utils/proxy.py index 294033fc..79cc27fb 100644 --- a/proxypool/utils/proxy.py +++ b/proxypool/utils/proxy.py @@ -5,7 +5,10 @@ def is_valid_proxy(data): """ check this string is within proxy format """ - if data.__contains__(':'): + if is_auth_proxy(data): + host, port = extract_auth_proxy(data) + return is_ip_valid(host) and is_port_valid(port) + elif data.__contains__(':'): ip = data.split(':')[0] port = data.split(':')[1] return is_ip_valid(ip) and is_port_valid(port) @@ -17,6 +20,8 @@ def is_ip_valid(ip): """ check this string is within ip format """ + if is_auth_proxy(ip): + ip = ip.split('@')[1] a = ip.split('.') if len(a) != 4: return False @@ -48,9 +53,36 @@ def convert_proxy_or_proxies(data): # skip invalid item item = item.strip() if not is_valid_proxy(item): continue - host, port = item.split(':') + if is_auth_proxy(item): + host, port = extract_auth_proxy(item) + else: + host, port = item.split(':') result.append(Proxy(host=host, port=int(port))) return result if isinstance(data, str) and is_valid_proxy(data): - host, port = data.split(':') + if is_auth_proxy(data): + host, port = extract_auth_proxy(data) + else: + host, port = data.split(':') return Proxy(host=host, port=int(port)) + + +def is_auth_proxy(data: str) -> bool: + return '@' in 
data + + +def extract_auth_proxy(data: str) -> (str, str): + """ + extract host and port from a proxy with authentication + """ + auth = data.split('@')[0] + ip_port = data.split('@')[1] + ip = ip_port.split(':')[0] + port = ip_port.split(':')[1] + host = auth + '@' + ip + return host, port + + +if __name__ == '__main__': + proxy = 'test1234:test5678.@117.68.216.212:32425' + print(extract_auth_proxy(proxy)) From 0a6d8610c135ae10748aff6f1164d4b911286bf6 Mon Sep 17 00:00:00 2001 From: inVains <2282229+inVains@users.noreply.github.com> Date: Thu, 23 Nov 2023 11:59:24 +0800 Subject: [PATCH 59/65] =?UTF-8?q?#140=20=E5=8F=8A=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E5=8F=AF=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E7=9A=84=E5=8F=82=E6=95=B0=20(#202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 1、设置scheduler.py中dev模式的flask run,不进行启动时的自动重新加载reload * 1、修改PROXY_SCORE_MAX,PROXY_SCORE_MIN,PROXY_SCORE_INIT三项配置,为可环境变量配置 2、添加可环境变量配置项TEST_DONT_SET_MAX_SCORE,允许设置当tester检测到某个proxy可用时,只是保持原score,而不将其score设置成max。 * 增加获取proxy接口的认证header, API-KEY。可配置,默认不需要 --- proxypool/processors/server.py | 29 ++++++++++++++++++++++++++--- proxypool/processors/tester.py | 20 ++++++++++++-------- proxypool/scheduler.py | 2 +- proxypool/setting.py | 12 +++++++++--- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index f7138c64..aa0500ac 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,7 +1,7 @@ -from flask import Flask, g +from flask import Flask, g, request from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED, IS_DEV - +from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV +import functools __all__ = ['app'] @@ -10,6 +10,25 @@ app.debug = True +def auth_required(func): + @functools.wraps(func) + def decorator(*args, **kwargs): + # conditional decorator, when setting API_KEY is set, otherwise just ignore this decorator + if API_KEY == "": + return func(*args, **kwargs) + if request.headers.get('API-KEY', None) is not None: + api_key = request.headers.get('API-KEY') + else: + return {"message": "Please provide an API key in header"}, 400 + # Check if API key is correct and valid + if request.method == "GET" and api_key == API_KEY: + return func(*args, **kwargs) + else: + return {"message": "The provided API key is not valid"}, 403 + + return decorator + + def get_conn(): """ get redis client object @@ -21,6 +40,7 @@ def get_conn(): @app.route('/') +@auth_required def index(): """ get home page, you can define your own templates @@ -30,6 +50,7 @@ def index(): @app.route('/random') +@auth_required def get_proxy(): """ get a random proxy @@ -40,6 +61,7 @@ def get_proxy(): @app.route('/all') +@auth_required def get_proxy_all(): """ get a random proxy @@ -56,6 +78,7 @@ def get_proxy_all(): @app.route('/count') +@auth_required def get_count(): """ get the count of proxies diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index f002056a..353332ac 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -3,11 +3,11 @@ from loguru import logger from proxypool.schemas import Proxy from proxypool.storages.redis import RedisClient -from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS +from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, 
TEST_ANONYMOUS, \ + TEST_DONT_SET_MAX_SCORE from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError from asyncio import TimeoutError - EXCEPTIONS = ( ClientProxyConnectionError, ConnectionRefusedError, @@ -23,14 +23,14 @@ class Tester(object): """ tester for testing proxies in queue """ - + def __init__(self): """ init redis """ self.redis = RedisClient() self.loop = asyncio.get_event_loop() - + async def test(self, proxy: Proxy): """ test single proxy @@ -55,15 +55,18 @@ async def test(self, proxy: Proxy): async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT, allow_redirects=False) as response: if response.status in TEST_VALID_STATUS: - self.redis.max(proxy) - logger.debug(f'proxy {proxy.string()} is valid, set max score') + if TEST_DONT_SET_MAX_SCORE: + logger.debug(f'proxy {proxy.string()} is valid, remain current score') + else: + self.redis.max(proxy) + logger.debug(f'proxy {proxy.string()} is valid, set max score') else: self.redis.decrease(proxy) logger.debug(f'proxy {proxy.string()} is invalid, decrease score') except EXCEPTIONS: self.redis.decrease(proxy) logger.debug(f'proxy {proxy.string()} is invalid, decrease score') - + @logger.catch def run(self): """ @@ -84,14 +87,15 @@ def run(self): if not cursor: break + def run_tester(): host = '96.113.165.182' port = '3128' tasks = [tester.test(Proxy(host=host, port=port))] tester.loop.run_until_complete(asyncio.wait(tasks)) + if __name__ == '__main__': tester = Tester() tester.run() # run_tester() - diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index f29b5655..a2d18abe 100644 --- a/proxypool/scheduler.py +++ b/proxypool/scheduler.py @@ -92,7 +92,7 @@ def run_server(self): logger.error("unsupported APP_PROD_METHOD") return else: - app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) + app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED, use_reloader=False) def run(self): global tester_process, getter_process, server_process diff --git a/proxypool/setting.py b/proxypool/setting.py index e68f45ca..495ad55f 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -53,9 +53,9 @@ 'REDIS_KEY', 'proxies:universal')) # definition of proxy scores -PROXY_SCORE_MAX = 100 -PROXY_SCORE_MIN = 0 -PROXY_SCORE_INIT = 10 +PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100) +PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0) +PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10) # definition of proxy number PROXY_NUMBER_MAX = 50000 @@ -77,11 +77,17 @@ # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', # }) TEST_VALID_STATUS = env.list('TEST_VALID_STATUS', [200, 206, 302]) +# whether to set max score when one proxy is tested valid +TEST_DONT_SET_MAX_SCORE = env.bool('TEST_DONT_SET_MAX_SCORE', False) # definition of api API_HOST = env.str('API_HOST', '0.0.0.0') API_PORT = env.int('API_PORT', 5555) API_THREADED = env.bool('API_THREADED', True) +# add an api key to get proxy +# need a header of `API-KEY` in get request to pass the authenticate +# API_KEY='', do not need `API-KEY` header +API_KEY = env.str('API_KEY', '') # flags of enable ENABLE_TESTER = env.bool('ENABLE_TESTER', True) From 2344ee1db07e9095e00153fa41280cfea436c8a4 Mon Sep 17 00:00:00 2001 From: Shayne Wang <1614565666@qq.com> Date: Fri, 1 Dec 2023 19:05:08 +0800 Subject: [PATCH 60/65] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=20(#203)?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proxypool/crawlers/public/docip.py | 38 +++++++++++++++++++++ proxypool/crawlers/public/uqidata.py | 49 ++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 proxypool/crawlers/public/docip.py create mode 100644 proxypool/crawlers/public/uqidata.py diff --git a/proxypool/crawlers/public/docip.py b/proxypool/crawlers/public/docip.py new file mode 100644 index 00000000..070c5983 --- /dev/null +++ b/proxypool/crawlers/public/docip.py @@ -0,0 +1,38 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://www.docip.net/data/free.json?t={date}' + + + +class DocipCrawler(BaseCrawler): + """ + Docip crawler, https://www.docip.net/data/free.json + """ + urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in proxy_list: + host = proxy_item['ip'] + port = proxy_item['port'] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + +if __name__ == '__main__': + crawler = DocipCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/uqidata.py b/proxypool/crawlers/public/uqidata.py new file mode 100644 index 00000000..3e54b2dc --- /dev/null +++ b/proxypool/crawlers/public/uqidata.py @@ -0,0 +1,49 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from loguru import logger + +BASE_URL = 'https://ip.uqidata.com/free/index.html' + + +class UqidataCrawler(BaseCrawler): + """ + Uqidata crawler, https://ip.uqidata.com/free/index.html + """ + urls = [BASE_URL] + ignore = True + + def encode(input_str): + tmp = [] + for i in range(len(input_str)): + tmp.append("ABCDEFGHIZ".find(input_str[i])) + result = "".join(str(i) for i in tmp) + result = int(result) >> 0x03 + return result + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items() + for tr in trs: + ip_html = tr('td.ip').find("*").items() + host = '' + for i in ip_html: + if i.attr('style') is not None and 'none' in i.attr('style'): + continue + if i.text() == '': + continue + host += i.text() + + port_code = tr('td.port').attr('class').split(' ')[1] + port = UqidataCrawler.encode(port_code) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = UqidataCrawler() + for proxy in crawler.crawl(): + print(proxy) From 97229e6f470ee0dd834e0ba4b60091ce85657b4f Mon Sep 17 00:00:00 2001 From: Dawei Feng <397615103@qq.com> Date: Wed, 27 Dec 2023 02:58:22 +0800 Subject: [PATCH 61/65] fix docip bug & add support for python 3.11 (#205) --- proxypool/crawlers/public/docip.py | 2 +- proxypool/processors/tester.py | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proxypool/crawlers/public/docip.py b/proxypool/crawlers/public/docip.py index 070c5983..154871fb 100644 --- a/proxypool/crawlers/public/docip.py +++ b/proxypool/crawlers/public/docip.py @@ -25,7 +25,7 @@ def parse(self, html): proxy_list = result['data'] for proxy_item in proxy_list: host = proxy_item['ip'] - port = 
proxy_item['port'] + port = host.split(':')[-1] yield Proxy(host=host, port=port) except json.JSONDecodeError: print("json.JSONDecodeError") diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index 353332ac..58795285 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -82,7 +82,7 @@ def run(self): logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}') cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH) if proxies: - tasks = [self.test(proxy) for proxy in proxies] + tasks = [self.loop.create_task(self.test(proxy)) for proxy in proxies] self.loop.run_until_complete(asyncio.wait(tasks)) if not cursor: break diff --git a/requirements.txt b/requirements.txt index c9407c7d..33f35c50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ redis>=3.5.3,<4.0.0 lxml>=4.6.5,<5.0.0 fake_headers>=1.0.2,<2.0.0 maxminddb_geolite2==2018.703 -gevent>=21.8.0,<22.0.0 +gevent>=21.8.0,<24.0.0 tornado>=6.0,<7.0 itsdangerous==0.24 MarkupSafe<2.1.0 From 003358686720e6450867d35ff821921a819d8e80 Mon Sep 17 00:00:00 2001 From: Cesaryuan <35998162+cesaryuan@users.noreply.github.com> Date: Mon, 1 Jan 2024 22:30:20 +0800 Subject: [PATCH 62/65] fix: avoid error on "ip:port:port" format (#207) --- proxypool/utils/proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxypool/utils/proxy.py b/proxypool/utils/proxy.py index 79cc27fb..5330ceb4 100644 --- a/proxypool/utils/proxy.py +++ b/proxypool/utils/proxy.py @@ -56,7 +56,7 @@ def convert_proxy_or_proxies(data): if is_auth_proxy(item): host, port = extract_auth_proxy(item) else: - host, port = item.split(':') + host, port, *_ = item.split(':') result.append(Proxy(host=host, port=int(port))) return result if isinstance(data, str) and is_valid_proxy(data): From 78b324498bc3ae9370a78bb08a7b89f1f25a50e3 Mon Sep 17 00:00:00 2001 From: inVains <2282229+inVains@users.noreply.github.com> Date: Tue, 26 Mar 2024 00:36:24 +0800 Subject: [PATCH 63/65] add sub proxy pool mechanics (#213) --- proxypool/processors/getter.py | 5 +++- proxypool/processors/server.py | 21 +++++++++++--- proxypool/processors/tester.py | 28 +++++++++++++++++++ proxypool/setting.py | 2 ++ proxypool/storages/redis.py | 50 +++++++++++++++++----------------- proxypool/testers/__init__.py | 16 +++++++++++ proxypool/testers/base.py | 19 +++++++++++++ 7 files changed, 111 insertions(+), 30 deletions(-) create mode 100644 proxypool/testers/__init__.py create mode 100644 proxypool/testers/base.py diff --git a/proxypool/processors/getter.py b/proxypool/processors/getter.py index 877e198a..c5c16296 100644 --- a/proxypool/processors/getter.py +++ b/proxypool/processors/getter.py @@ -2,7 +2,7 @@ from proxypool.storages.redis import RedisClient from proxypool.setting import PROXY_NUMBER_MAX from proxypool.crawlers import __all__ as crawlers_cls - +from proxypool.testers import __all__ as testers_cls class Getter(object): """ @@ -16,6 +16,8 @@ def __init__(self): self.redis = RedisClient() self.crawlers_cls = crawlers_cls self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls] + self.testers_cls = testers_cls + self.testers = [tester_cls() for tester_cls in self.testers_cls] def is_full(self): """ @@ -36,6 +38,7 @@ def run(self): logger.info(f'crawler {crawler} to get proxy') for proxy in crawler.crawl(): self.redis.add(proxy) + [self.redis.add(proxy, redis_key=tester.key) for tester in self.testers] if __name__ == '__main__': diff --git a/proxypool/processors/server.py 
b/proxypool/processors/server.py index aa0500ac..50144590 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,6 +1,7 @@ from flask import Flask, g, request +from proxypool.exceptions import PoolEmptyException from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV +from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED import functools __all__ = ['app'] @@ -53,10 +54,19 @@ def index(): @auth_required def get_proxy(): """ - get a random proxy + get a random proxy, can query the specific sub-pool according the (redis) key + if PROXY_RAND_KEY_DEGRADED is set to True, will get a universal random proxy if no proxy found in the sub-pool :return: get a random proxy """ + key = request.args.get('key') conn = get_conn() + # return conn.random(key).string() if key else conn.random().string() + if key: + try: + return conn.random(key).string() + except PoolEmptyException: + if not PROXY_RAND_KEY_DEGRADED: + raise return conn.random().string() @@ -67,8 +77,10 @@ def get_proxy_all(): get a random proxy :return: get a random proxy """ + key = request.args.get('key') + conn = get_conn() - proxies = conn.all() + proxies = conn.all(key) if key else conn.all() proxies_string = '' if proxies: for proxy in proxies: @@ -85,7 +97,8 @@ def get_count(): :return: count, int """ conn = get_conn() - return str(conn.count()) + key = request.args.get('key') + return str(conn.count(key)) if key else conn.count() if __name__ == '__main__': diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index 58795285..470259a9 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -7,6 +7,7 @@ TEST_DONT_SET_MAX_SCORE from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError from asyncio import TimeoutError +from proxypool.testers import __all__ as testers_cls EXCEPTIONS = ( ClientProxyConnectionError, @@ -30,6 +31,8 @@ def __init__(self): """ self.redis = RedisClient() self.loop = asyncio.get_event_loop() + self.testers_cls = testers_cls + self.testers = [tester_cls() for tester_cls in self.testers_cls] async def test(self, proxy: Proxy): """ @@ -63,8 +66,33 @@ async def test(self, proxy: Proxy): else: self.redis.decrease(proxy) logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + # if independent tester class found, create new set of storage and do the extra test + for tester in self.testers: + key = tester.key + if self.redis.exists(proxy, key): + test_url = tester.test_url + headers = tester.headers() + cookies = tester.cookies() + async with session.get(test_url, proxy=f'http://{proxy.string()}', + timeout=TEST_TIMEOUT, + headers=headers, + cookies=cookies, + allow_redirects=False) as response: + resp_text = await response.text() + is_valid = await tester.parse(resp_text, test_url, proxy.string()) + if is_valid: + if tester.test_dont_set_max_score: + logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score') + else: + self.redis.max(proxy, key, tester.proxy_score_max) + logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score') + else: + self.redis.decrease(proxy, tester.key, tester.proxy_score_min) + logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score') + except EXCEPTIONS: self.redis.decrease(proxy) + [self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers] 
logger.debug(f'proxy {proxy.string()} is invalid, decrease score') @logger.catch diff --git a/proxypool/setting.py b/proxypool/setting.py index 495ad55f..a445667e 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -56,6 +56,8 @@ PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100) PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0) PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10) +# whether to get a universal random proxy if no proxy exists in the sub-pool identified by a specific key +PROXY_RAND_KEY_DEGRADED = env.bool('TEST_ANONYMOUS', True) # definition of proxy number PROXY_NUMBER_MAX = 50000 diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 8ab0e41d..2d052323 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -34,7 +34,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db self.db = redis.StrictRedis( host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) - def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int: + def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int: """ add proxy and set it to init score :param proxy: proxy, ip:port, like 8.8.8.8:88 @@ -44,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int: if not is_valid_proxy(f'{proxy.host}:{proxy.port}'): logger.info(f'invalid proxy {proxy}, throw it') return - if not self.exists(proxy): + if not self.exists(proxy, redis_key): if IS_REDIS_VERSION_2: - return self.db.zadd(REDIS_KEY, score, proxy.string()) - return self.db.zadd(REDIS_KEY, {proxy.string(): score}) + return self.db.zadd(redis_key, score, proxy.string()) + return self.db.zadd(redis_key, {proxy.string(): score}) - def random(self) -> Proxy: + def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy: """ get random proxy firstly try to get proxy with max score @@ -59,74 +59,74 @@ def random(self) -> Proxy: """ # try to get proxy with max score proxies = self.db.zrangebyscore( - REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) + redis_key, proxy_score_max, proxy_score_max) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else get proxy by rank proxies = self.db.zrevrange( - REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) + redis_key, proxy_score_min, proxy_score_max) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else raise error raise PoolEmptyException - def decrease(self, proxy: Proxy) -> int: + def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int: """ decrease score of proxy, if small than PROXY_SCORE_MIN, delete it :param proxy: proxy :return: new score """ if IS_REDIS_VERSION_2: - self.db.zincrby(REDIS_KEY, proxy.string(), -1) + self.db.zincrby(redis_key, proxy.string(), -1) else: - self.db.zincrby(REDIS_KEY, -1, proxy.string()) - score = self.db.zscore(REDIS_KEY, proxy.string()) + self.db.zincrby(redis_key, -1, proxy.string()) + score = self.db.zscore(redis_key, proxy.string()) logger.info(f'{proxy.string()} score decrease 1, current {score}') - if score <= PROXY_SCORE_MIN: + if score <= proxy_score_min: logger.info(f'{proxy.string()} current score {score}, remove') - self.db.zrem(REDIS_KEY, proxy.string()) + self.db.zrem(redis_key, proxy.string()) - def exists(self, proxy: Proxy) -> bool: + def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool: """ if proxy exists :param proxy: proxy :return: if exists, bool """ - return not self.db.zscore(REDIS_KEY, 
proxy.string()) is None + return not self.db.zscore(redis_key, proxy.string()) is None - def max(self, proxy: Proxy) -> int: + def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int: """ set proxy to max score :param proxy: proxy :return: new score """ - logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}') + logger.info(f'{proxy.string()} is valid, set to {proxy_score_max}') if IS_REDIS_VERSION_2: - return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string()) - return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX}) + return self.db.zadd(redis_key, proxy_score_max, proxy.string()) + return self.db.zadd(redis_key, {proxy.string(): proxy_score_max}) - def count(self) -> int: + def count(self, redis_key=REDIS_KEY) -> int: """ get count of proxies :return: count, int """ - return self.db.zcard(REDIS_KEY) + return self.db.zcard(redis_key) - def all(self) -> List[Proxy]: + def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]: """ get all proxies :return: list of proxies """ - return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)) + return convert_proxy_or_proxies(self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max)) - def batch(self, cursor, count) -> List[Proxy]: + def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]: """ get batch of proxies :param cursor: scan cursor :param count: scan count :return: list of proxies """ - cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count) + cursor, proxies = self.db.zscan(redis_key, cursor, count=count) return cursor, convert_proxy_or_proxies([i[0] for i in proxies]) diff --git a/proxypool/testers/__init__.py b/proxypool/testers/__init__.py new file mode 100644 index 00000000..4e4df95e --- /dev/null +++ b/proxypool/testers/__init__.py @@ -0,0 +1,16 @@ +import pkgutil +from .base import BaseTester +import inspect + + +# load classes subclass of BaseCrawler +classes = [] +for loader, name, is_pkg in pkgutil.walk_packages(__path__): + module = loader.find_module(name).load_module(name) + for name, value in inspect.getmembers(module): + globals()[name] = value + if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \ + and not getattr(value, 'ignore', False): + classes.append(value) +__all__ = __ALL__ = classes + diff --git a/proxypool/testers/base.py b/proxypool/testers/base.py new file mode 100644 index 00000000..796b7cfc --- /dev/null +++ b/proxypool/testers/base.py @@ -0,0 +1,19 @@ +from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN + + +class BaseTester(object): + test_url = "" + key = "" + test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE + proxy_score_init = PROXY_SCORE_INIT + proxy_score_max = PROXY_SCORE_MAX + proxy_score_min = PROXY_SCORE_MIN + + def headers(self): + return None + + def cookies(self): + return None + + async def parse(self, html, url, proxy, expr='{"code":0'): + return True if expr in html else False From 7c77ad0b078702918882c1fd0976419642daad24 Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 30 Jun 2024 11:08:48 +0800 Subject: [PATCH 64/65] validate testing ip --- README.md | 11 ---------- build.yaml | 18 ---------------- docker-compose.yml | 5 ++--- proxypool/processors/tester.py | 39 +++++++++++++++++++++++----------- 4 files changed, 29 insertions(+), 44 deletions(-) delete mode 100644 build.yaml diff --git 
a/README.md b/README.md index d01d7a08..e1435763 100644 --- a/README.md +++ b/README.md @@ -74,12 +74,6 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 -当然你也可以选择自己 Build,直接运行如下命令即可: - -``` -docker-compose -f build.yaml up -``` - 如果下载速度特别慢,可以自行修改 Dockerfile,修改: ```diff @@ -347,11 +341,6 @@ class Daili66Crawler(BaseCrawler): 本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,请参考 [kubernetes](./kubernetes)。 -## 待开发 - -- [ ] 前端页面管理 -- [ ] 使用情况统计分析 - 如有一起开发的兴趣可以在 Issue 留言,非常感谢! ## LICENSE diff --git a/build.yaml b/build.yaml deleted file mode 100644 index 74b2fd0b..00000000 --- a/build.yaml +++ /dev/null @@ -1,18 +0,0 @@ -version: "3" -services: - redis4proxypool: - image: redis:alpine - container_name: redis4proxypool - ports: - - "6374:6379" - proxypool: - build: . - image: "germey/proxypool:master" - container_name: proxypool - ports: - - "5555:5555" - restart: always - # volumes: - # - proxypool/crawlers/private:/app/proxypool/crawlers/private - environment: - PROXYPOOL_REDIS_CONNECTION_STRING: redis://@redis4proxypool:6379/0 diff --git a/docker-compose.yml b/docker-compose.yml index cf367f42..4e4d5936 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,16 +3,15 @@ services: redis4proxypool: image: redis:alpine container_name: redis4proxypool - # ports: - # - "6374:6379" proxypool: + build: . image: "germey/proxypool:master" container_name: proxypool ports: - "5555:5555" restart: always # volumes: - # - proxypool/crawlers/private:/app/proxypool/crawlers/private + # - proxypool/crawlers/private:~/proxypool/crawlers/private environment: PROXYPOOL_REDIS_HOST: redis4proxypool diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index 470259a9..6937af8c 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -45,27 +45,33 @@ async def test(self, proxy: Proxy): logger.debug(f'testing {proxy.string()}') # if TEST_ANONYMOUS is True, make sure that # the proxy has the effect of hiding the real IP + # logger.debug(f'TEST_ANONYMOUS {TEST_ANONYMOUS}') if TEST_ANONYMOUS: url = 'https://httpbin.org/ip' async with session.get(url, timeout=TEST_TIMEOUT) as response: resp_json = await response.json() origin_ip = resp_json['origin'] + # logger.debug(f'origin ip is {origin_ip}') async with session.get(url, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT) as response: resp_json = await response.json() anonymous_ip = resp_json['origin'] + logger.debug(f'anonymous ip is {anonymous_ip}') assert origin_ip != anonymous_ip assert proxy.host == anonymous_ip async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT, allow_redirects=False) as response: if response.status in TEST_VALID_STATUS: if TEST_DONT_SET_MAX_SCORE: - logger.debug(f'proxy {proxy.string()} is valid, remain current score') + logger.debug( + f'proxy {proxy.string()} is valid, remain current score') else: self.redis.max(proxy) - logger.debug(f'proxy {proxy.string()} is valid, set max score') + logger.debug( + f'proxy {proxy.string()} is valid, set max score') else: self.redis.decrease(proxy) - logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + logger.debug( + f'proxy {proxy.string()} is invalid, decrease score') # if independent tester class found, create new set of storage and do the extra test for tester in self.testers: key = tester.key @@ -82,18 +88,25 @@ async def test(self, proxy: Proxy): is_valid = await tester.parse(resp_text, 
test_url, proxy.string()) if is_valid: if tester.test_dont_set_max_score: - logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score') + logger.info( + f'key[{key}] proxy {proxy.string()} is valid, remain current score') else: - self.redis.max(proxy, key, tester.proxy_score_max) - logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score') + self.redis.max( + proxy, key, tester.proxy_score_max) + logger.info( + f'key[{key}] proxy {proxy.string()} is valid, set max score') else: - self.redis.decrease(proxy, tester.key, tester.proxy_score_min) - logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score') + self.redis.decrease( + proxy, tester.key, tester.proxy_score_min) + logger.info( + f'key[{key}] proxy {proxy.string()} is invalid, decrease score') except EXCEPTIONS: self.redis.decrease(proxy) - [self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers] - logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + [self.redis.decrease(proxy, tester.key, tester.proxy_score_min) + for tester in self.testers] + logger.debug( + f'proxy {proxy.string()} is invalid, decrease score') @logger.catch def run(self): @@ -107,10 +120,12 @@ def run(self): logger.debug(f'{count} proxies to test') cursor = 0 while True: - logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}') + logger.debug( + f'testing proxies use cursor {cursor}, count {TEST_BATCH}') cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH) if proxies: - tasks = [self.loop.create_task(self.test(proxy)) for proxy in proxies] + tasks = [self.loop.create_task( + self.test(proxy)) for proxy in proxies] self.loop.run_until_complete(asyncio.wait(tasks)) if not cursor: break From 3ed558b2255d0790d51b838422163e5cdc0d414f Mon Sep 17 00:00:00 2001 From: Germey <cqc@cuiqingcai.com> Date: Sun, 30 Jun 2024 11:16:40 +0800 Subject: [PATCH 65/65] add proxy --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index e1435763..474bf65b 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,18 @@ 代理池原理解析可见「[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)」,建议使用之前阅读。 +## 使用前注意 + +本代理池是基于市面上各种公开代理源搭建的,所以可用性并不高,很可能上百上千个代理中才能找到一两个可用代理,不适合直接用于爬虫爬取任务。 + +如果您的目的是为了尽快使用代理完成爬取任务,建议您对接一些付费代理或者直接使用已有代理资源;如果您的目的是为了学习如何搭建一个代理池,您可以参考本项目继续完成后续步骤。 + +付费代理推荐: + +- [ADSL 拨号代理](https://platform.acedata.cloud/documents/a82a528a-8e32-4c4c-a9d0-a21be7c9ef8c):海量拨号(中国境内)高质量代理 +- [海外/全球代理](https://platform.acedata.cloud/documents/50f1437a-1857-43c5-85cf-5800ae1b31e4):中国境外高质量代理 +- [蜂窝 4G/5G 代理](https://platform.acedata.cloud/documents/1cc59b19-1550-4169-a59d-ad6faf7f7517):极高质量(中国境内)防风控代理 + ## 使用准备 首先当然是克隆代码并进入 ProxyPool 文件夹: