diff --git a/.dockerignore b/.dockerignore index 803baf5e..3eab792f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -128,4 +128,8 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +proxypool/.env +.DS_Store +.vscode \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 183e5a79..fe439bc2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,25 +2,44 @@ name: build on: push: branches: - - master + - master paths-ignore: - - .gitignore - - README.md - - '.github/ISSUE_TEMPLATE/**' + - .gitignore + - README.md + - '.github/ISSUE_TEMPLATE/**' + release: + types: [published] + jobs: build: runs-on: ubuntu-latest steps: - - name: Checkout Source - uses: actions/checkout@v1 - - name: Docker Login - run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} - - name: Build the Docker Image - run: docker-compose build - - name: Push the Docker Image - run: docker-compose push - - name: Tag and Push Master Version - run: | - docker tag germey/proxypool germey/proxypool:master - docker push germey/proxypool:master + - name: Checkout + uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: germey + password: ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} + - name: Get current date + id: date + run: echo "::set-output name=date::$(date +'%Y%m%d')" + + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + push: true + platforms: linux/amd64 + tags: | + germey/proxypool:latest + germey/proxypool:master + germey/proxypool:${{ steps.date.outputs.date }} diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 8257ba0a..871642d7 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -35,7 +35,7 @@ jobs: echo $BUILD_NUMBER - name: Build Push Deploy run: | - docker-compose build + docker-compose -f build.yaml build docker tag germey/proxypool germey/proxypool:$BUILD_NUMBER docker push germey/proxypool docker push germey/proxypool:$BUILD_NUMBER diff --git a/.gitignore b/.gitignore index 7f21799f..16a7490c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.db venv /.idea -*.log \ No newline at end of file +*.log +.DS_Store \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index dab1227e..c5ca5440 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,18 @@ -FROM python:3.6 +FROM python:3.7-alpine AS build +COPY requirements.txt . +RUN apk update &&\ + apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev build-base musl-dev &&\ + pip install -U pip &&\ + pip install --timeout 30 --user --no-cache-dir --no-warn-script-location -r requirements.txt + +FROM python:3.7-alpine +ENV APP_ENV=prod +ENV LOCAL_PKG="/root/.local" +COPY --from=build ${LOCAL_PKG} ${LOCAL_PKG} +RUN apk update && apk add --no-cache libffi-dev openssl-dev libxslt-dev &&\ + ln -sf ${LOCAL_PKG}/bin/* /usr/local/bin/ WORKDIR /app COPY . . 
-RUN pip install -r requirements.txt +EXPOSE 5555 VOLUME ["/app/proxypool/crawlers/private"] -CMD ["supervisord", "-c", "supervisord.conf"] +ENTRYPOINT ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/README.md b/README.md index ee7c108b..474bf65b 100644 --- a/README.md +++ b/README.md @@ -7,36 +7,57 @@ 简易高效的代理池,提供如下功能: -* 定时抓取免费代理网站,简易可扩展。 -* 使用 Redis 对代理进行存储并对代理可用性进行排序。 -* 定时测试和筛选,剔除不可用代理,留下可用代理。 -* 提供代理 API,随机取用测试通过的可用代理。 +- 定时抓取免费代理网站,简易可扩展。 +- 使用 Redis 对代理进行存储并对代理可用性进行排序。 +- 定时测试和筛选,剔除不可用代理,留下可用代理。 +- 提供代理 API,随机取用测试通过的可用代理。 代理池原理解析可见「[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)」,建议使用之前阅读。 -## 运行示例 +## 使用前注意 -API Server 可以见[部署样例](https://universal.proxypool.cuiqingcai.com/),随机代理[取用地址](https://universal.proxypool.cuiqingcai.com/random),代理源比较少,仅供演示。 +本代理池是基于市面上各种公开代理源搭建的,所以可用性并不高,很可能上百上千个代理中才能找到一两个可用代理,不适合直接用于爬虫爬取任务。 -本样例为 GitHub Actions + Kubernetes 自动部署 master 分支代码结果。 +如果您的目的是为了尽快使用代理完成爬取任务,建议您对接一些付费代理或者直接使用已有代理资源;如果您的目的是为了学习如何搭建一个代理池,您可以参考本项目继续完成后续步骤。 + +付费代理推荐: + +- [ADSL 拨号代理](https://platform.acedata.cloud/documents/a82a528a-8e32-4c4c-a9d0-a21be7c9ef8c):海量拨号(中国境内)高质量代理 +- [海外/全球代理](https://platform.acedata.cloud/documents/50f1437a-1857-43c5-85cf-5800ae1b31e4):中国境外高质量代理 +- [蜂窝 4G/5G 代理](https://platform.acedata.cloud/documents/1cc59b19-1550-4169-a59d-ad6faf7f7517):极高质量(中国境内)防风控代理 + +## 使用准备 + +首先当然是克隆代码并进入 ProxyPool 文件夹: + +``` +git clone https://github.com/Python3WebSpider/ProxyPool.git +cd ProxyPool +``` + +然后选用下面 Docker 和常规方式任意一个执行即可。 ## 使用要求 -可以通过两种方式来运行代理池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行。 +可以通过两种方式来运行代理池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行,要求如下: ### Docker 如果使用 Docker,则需要安装如下环境: -* Docker -* Docker-Compose +- Docker +- Docker-Compose + +安装方法自行搜索即可。 + +官方 Docker Hub 镜像:[germey/proxypool](https://hub.docker.com/r/germey/proxypool) ### 常规方式 常规方式要求有 Python 环境、Redis 环境,具体要求如下: -* Python>=3.6 -* Redis +- Python>=3.6 +- Redis ## Docker 运行 @@ -65,6 +86,13 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 +如果下载速度特别慢,可以自行修改 Dockerfile,修改: + +```diff +- RUN pip install -r requirements.txt ++ RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +``` + ## 常规方式运行 如果不使用 Docker 运行,配置好 Python、Redis 环境之后也可运行,步骤如下。 @@ -80,31 +108,26 @@ proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING stat 设置 host、port、password,如果 password 为空可以设置为空字符串,示例如下: ```shell script -export REDIS_HOST='localhost' -export REDIS_PORT=6379 -export REDIS_PASSWORD='' -export REDIS_DB=0 +export PROXYPOOL_REDIS_HOST='localhost' +export PROXYPOOL_REDIS_PORT=6379 +export PROXYPOOL_REDIS_PASSWORD='' +export PROXYPOOL_REDIS_DB=0 ``` 或者只设置连接字符串: ```shell script -export REDIS_CONNECTION_STRING='redis://[password]@host:port/db' -``` - -如果没有密码也要设置为: - -```shell script -export REDIS_CONNECTION_STRING='redis://@host:port/db' +export PROXYPOOL_REDIS_CONNECTION_STRING='redis://localhost' ``` -这里连接字符串的格式需要符合 `redis://[password]@host:port/db` 的格式,注意不要遗漏 `@`。 +这里连接字符串的格式需要符合 `redis://[:password@]host[:port][/database]` 的格式, +中括号参数可以省略,port 默认是 6379,database 默认是 0,密码默认为空。 以上两种设置任选其一即可。 ### 安装依赖包 -这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) +这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) 或 [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html) 
创建虚拟环境,Python 版本不低于 3.6。 然后 pip 安装依赖即可: @@ -184,15 +207,15 @@ if __name__ == '__main__': ``` get random proxy 116.196.115.209:8080 { - "args": {}, + "args": {}, "headers": { - "Accept": "*/*", - "Accept-Encoding": "gzip, deflate", - "Host": "httpbin.org", - "User-Agent": "python-requests/2.22.0", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate", + "Host": "httpbin.org", + "User-Agent": "python-requests/2.22.0", "X-Amzn-Trace-Id": "Root=1-5e4d7140-662d9053c0a2e513c7278364" - }, - "origin": "116.196.115.209", + }, + "origin": "116.196.115.209", "url": "https://httpbin.org/get" } ``` @@ -205,41 +228,48 @@ get random proxy 116.196.115.209:8080 ### 开关 -* ENABLE_TESTER:允许 Tester 启动,默认 true -* ENABLE_GETTER:允许 Getter 启动,默认 true -* ENABLE_SERVER:运行 Server 启动,默认 true +- ENABLE_TESTER:允许 Tester 启动,默认 true +- ENABLE_GETTER:允许 Getter 启动,默认 true +- ENABLE_SERVER:运行 Server 启动,默认 true ### 环境 -* APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev -* APP_DEBUG:调试模式,可以设置 true 或 false,默认 true +- APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev +- APP_DEBUG:调试模式,可以设置 true 或 false,默认 true +- APP_PROD_METHOD: 正式环境启动应用方式,默认是`gevent`, + 可选:`tornado`,`meinheld`(分别需要安装 tornado 或 meinheld 模块) ### Redis 连接 -* REDIS_HOST:Redis 的 Host -* REDIS_PORT:Redis 的端口 -* REDIS_PASSWORD:Redis 的密码 -* REDIS_DB:Redis 的数据库索引,如 0、1 -* REDIS_CONNECTION_STRING:Redis 连接字符串 -* REDIS_KEY:Redis 储存代理使用字典的名称 +- PROXYPOOL_REDIS_HOST / REDIS_HOST:Redis 的 Host,其中 PROXYPOOL_REDIS_HOST 会覆盖 REDIS_HOST 的值。 +- PROXYPOOL_REDIS_PORT / REDIS_PORT:Redis 的端口,其中 PROXYPOOL_REDIS_PORT 会覆盖 REDIS_PORT 的值。 +- PROXYPOOL_REDIS_PASSWORD / REDIS_PASSWORD:Redis 的密码,其中 PROXYPOOL_REDIS_PASSWORD 会覆盖 REDIS_PASSWORD 的值。 +- PROXYPOOL_REDIS_DB / REDIS_DB:Redis 的数据库索引,如 0、1,其中 PROXYPOOL_REDIS_DB 会覆盖 REDIS_DB 的值。 +- PROXYPOOL_REDIS_CONNECTION_STRING / REDIS_CONNECTION_STRING:Redis 连接字符串,其中 PROXYPOOL_REDIS_CONNECTION_STRING 会覆盖 REDIS_CONNECTION_STRING 的值。 +- PROXYPOOL_REDIS_KEY / REDIS_KEY:Redis 储存代理使用字典的名称,其中 PROXYPOOL_REDIS_KEY 会覆盖 REDIS_KEY 的值。 ### 处理器 -* CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 -* CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 -* TEST_URL:测试 URL,默认百度 -* TEST_TIMEOUT:测试超时时间,默认 10 秒 -* TEST_BATCH:批量测试数量,默认 20 个代理 -* TEST_VALID_STATUS:测试有效的状态吗 -* API_HOST:代理 Server 运行 Host,默认 0.0.0.0 -* API_PORT:代理 Server 运行端口,默认 5555 -* API_THREADED:代理 Server 是否使用多线程,默认 true +- CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 +- CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 +- TEST_URL:测试 URL,默认百度 +- TEST_TIMEOUT:测试超时时间,默认 10 秒 +- TEST_BATCH:批量测试数量,默认 20 个代理 +- TEST_VALID_STATUS:测试有效的状态码 +- API_HOST:代理 Server 运行 Host,默认 0.0.0.0 +- API_PORT:代理 Server 运行端口,默认 5555 +- API_THREADED:代理 Server 是否使用多线程,默认 true ### 日志 -* LOG_DIR:日志相对路径 -* LOG_RUNTIME_FILE:运行日志文件名称 -* LOG_ERROR_FILE:错误日志文件名称 +- LOG_DIR:日志相对路径 +- LOG_RUNTIME_FILE:运行日志文件名称 +- LOG_ERROR_FILE:错误日志文件名称 +- LOG_ROTATION: 日志记录周转周期或大小,默认 500MB,见 [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- LOG_RETENTION: 日志保留日期,默认 7 天,见 [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- ENABLE_LOG_FILE:是否输出 log 文件,默认 true,如果设置为 false,那么 ENABLE_LOG_RUNTIME_FILE 和 ENABLE_LOG_ERROR_FILE 都不会生效 +- ENABLE_LOG_RUNTIME_FILE:是否输出 runtime log 文件,默认 true +- ENABLE_LOG_ERROR_FILE:是否输出 error log 文件,默认 true 以上内容均可使用环境变量配置,即在运行前设置对应环境变量值即可,如更改测试地址和 Redis 键名: @@ -253,7 +283,7 @@ export REDIS_KEY=proxies:weibo 如果使用 Docker-Compose 启动代理池,则需要在 docker-compose.yml 文件里面指定环境变量,如: ```yaml -version: '3' +version: "3" 
services: redis: image: redis:alpine @@ -264,7 +294,7 @@ services: restart: always proxypool: build: . - image: 'germey/proxypool' + image: "germey/proxypool" container_name: proxypool ports: - "5555:5555" @@ -296,7 +326,7 @@ class Daili66Crawler(BaseCrawler): daili66 crawler, http://www.66ip.cn/1.html """ urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] - + def parse(self, html): """ parse html file to get proxies @@ -312,8 +342,8 @@ class Daili66Crawler(BaseCrawler): 在这里只需要定义一个 Crawler 继承 BaseCrawler 即可,然后定义好 urls 变量和 parse 方法即可。 -* urls 变量即为爬取的代理网站网址列表,可以用程序定义也可写成固定内容。 -* parse 方法接收一个参数即 html,代理网址的 html,在 parse 方法里只需要写好 html 的解析,解析出 host 和 port,并构建 Proxy 对象 yield 返回即可。 +- urls 变量即为爬取的代理网站网址列表,可以用程序定义也可写成固定内容。 +- parse 方法接收一个参数即 html,代理网址的 html,在 parse 方法里只需要写好 html 的解析,解析出 host 和 port,并构建 Proxy 对象 yield 返回即可。 网页的爬取不需要实现,BaseCrawler 已经有了默认实现,如需更改爬取方式,重写 crawl 方法即可。 @@ -321,16 +351,7 @@ class Daili66Crawler(BaseCrawler): ## 部署 -本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,执行如下命令即可: - -```shell script -cat deployment.yml | sed 's/\${TAG}/latest/g' | kubectl apply -f - -``` - -## 待开发 - -- [ ] 前端页面管理 -- [ ] 使用情况统计分析 +本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,请参考 [kubernetes](./kubernetes)。 如有一起开发的兴趣可以在 Issue 留言,非常感谢! diff --git a/deployment.yml b/deployment.yml deleted file mode 100644 index c7aaea55..00000000 --- a/deployment.yml +++ /dev/null @@ -1,99 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - creationTimestamp: null - name: proxypool ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: proxypool - namespace: proxypool -spec: - storageClassName: azure-file - accessModes: - - ReadWriteMany - resources: - requests: - storage: 2Gi ---- -apiVersion: v1 -items: - - apiVersion: v1 - kind: Service - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - name: proxypool - namespace: proxypool - spec: - ports: - - name: "5555" - port: 5555 - targetPort: 5555 - selector: - io.kompose.service: proxypool - status: - loadBalancer: {} - - apiVersion: apps/v1 - kind: Deployment - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - name: proxypool - namespace: proxypool - spec: - replicas: 2 - revisionHistoryLimit: 1 - strategy: {} - selector: - matchLabels: - io.kompose.service: proxypool - template: - metadata: - annotations: - kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml - kompose.version: 1.20.0 () - creationTimestamp: null - labels: - io.kompose.service: proxypool - spec: - containers: - - env: - - name: REDIS_CONNECTION_STRING - valueFrom: - secretKeyRef: - name: redis - key: connection_string - - name: REDIS_PORT - value: '6379' - image: germey/proxypool:${TAG} - name: proxypool - resources: - limits: - memory: "500Mi" - cpu: "300m" - requests: - memory: "500Mi" - cpu: "300m" - ports: - - containerPort: 5555 - volumeMounts: - - mountPath: "/app/proxypool/crawlers/private" - name: proxypool - restartPolicy: Always - volumes: - - name: proxypool - persistentVolumeClaim: - claimName: pvc-proxypool - status: {} -kind: List -metadata: {} diff --git a/docker-compose.yml b/docker-compose.yml index f0181db2..4e4d5936 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,20 +1,17 @@ -version: '3' +version: "3" services: - redis: + redis4proxypool: image: 
redis:alpine - container_name: redis - command: redis-server - ports: - - "6379:6379" - restart: always + container_name: redis4proxypool proxypool: build: . - image: 'germey/proxypool' + image: "germey/proxypool:master" container_name: proxypool ports: - "5555:5555" restart: always - volumes: - - /tmp/proxypool/crawlers/private:/app/proxypool/crawlers/private + # volumes: + # - proxypool/crawlers/private:~/proxypool/crawlers/private environment: - REDIS_HOST: redis \ No newline at end of file + PROXYPOOL_REDIS_HOST: redis4proxypool + diff --git a/examples/usage.py b/examples/usage.py index 87c73b35..bc699ba9 100644 --- a/examples/usage.py +++ b/examples/usage.py @@ -2,7 +2,7 @@ proxypool_url = 'http://127.0.0.1:5555/random' -target_url = 'http://httpbin.org/get' +target_url = 'https://antispider5.scrape.center/' def get_random_proxy(): diff --git a/examples/usage2.py b/examples/usage2.py new file mode 100644 index 00000000..918c5eb2 --- /dev/null +++ b/examples/usage2.py @@ -0,0 +1,95 @@ +# -*- coding: UTF-8 -*- + +''' +''' +import requests +import time +import threading +import urllib3 +from fake_headers import Headers +import uuid +from geolite2 import geolite2 +ips = [] + +# 爬数据的线程类 + +def getChinaIP(ip='127.0.0.1'): + reader = geolite2.reader() + ip_info = reader.get(ip) + geolite2.close() + print(ip_info) + return True if ip_info['country']['iso_code'] == 'CN' else False + + + +class CrawlThread(threading.Thread): + def __init__(self, proxyip): + super(CrawlThread, self).__init__() + self.proxyip = proxyip + + def run(self): + # 开始计时 + pure_ip_address = self.proxyip.split(':')[0] + # 验证IP归属 + if not getChinaIP(pure_ip_address): + # pass + raise ValueError('不是有效IP') + # + start = time.time() + # 消除关闭证书验证的警告 + urllib3.disable_warnings() + headers = Headers(headers=True).generate() + headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' + headers['Pragma'] = 'no-cache' + headers['Host'] = 'bb.cf08tp.cn' + headers['x-forward-for'] = pure_ip_address + headers['Cookie'] = 'PHPSESSID={}'.format( + ''.join(str(uuid.uuid1()).split('-'))) + print(headers) + html = requests.get(headers=headers, url=targetUrl, proxies={ + "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode() + # 结束计时 + end = time.time() + # 输出内容 + print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) + + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************") + +# 获取代理IP的线程类 + + +class GetIpThread(threading.Thread): + def __init__(self, fetchSecond): + super(GetIpThread, self).__init__() + self.fetchSecond = fetchSecond + + def run(self): + global ips + while True: + # 获取IP列表 + res = requests.get(apiUrl).content.decode() + # 按照\n分割获取到的IP + ips = res.split('\n') + # 利用每一个IP + for proxyip in ips: + if proxyip.strip(): + # 开启一个线程 + # CrawlThread(proxyip).start() + try: + CrawlThread(proxyip).run() + time.sleep(1.5) + except Exception as e: + print(e) + # 休眠 + time.sleep(len(ips) /self.fetchSecond ) + + +if __name__ == '__main__': + # 获取IP的API接口 + # apiUrl = "http://127.0.0.1:5555/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp=' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/ingress.yml b/ingress.yml deleted file mode 100644 index 166eb729..00000000 --- a/ingress.yml +++ /dev/null @@ -1,32 
+0,0 @@ ---- -apiVersion: networking.k8s.io/v1beta1 -kind: Ingress -metadata: - name: ingress-universal-proxypool - namespace: proxypool - annotations: - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/rewrite-target: / -spec: - tls: - - hosts: - - universal.proxypool.cuiqingcai.com - secretName: tls-wildcard-proxypool-cuiqingcai-com - - hosts: - - proxypool.scrape.center - secretName: tls-wildcard-scrape-center - rules: - - host: universal.proxypool.cuiqingcai.com - http: - paths: - - backend: - serviceName: proxypool - servicePort: 5555 - path: / - - host: proxypool.scrape.center - http: - paths: - - backend: - serviceName: proxypool - servicePort: 5555 - path: / \ No newline at end of file diff --git a/kubernetes/.helmignore b/kubernetes/.helmignore new file mode 100644 index 00000000..9716c30e --- /dev/null +++ b/kubernetes/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +image/ \ No newline at end of file diff --git a/kubernetes/Chart.yaml b/kubernetes/Chart.yaml new file mode 100644 index 00000000..58db2bc2 --- /dev/null +++ b/kubernetes/Chart.yaml @@ -0,0 +1,27 @@ +apiVersion: v2 +name: proxypool +description: A Efficient Proxy Pool + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# Keywords about this application. +keywords: + - proxypool + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +appVersion: 1.16.0 diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 00000000..327880df --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,42 @@ +# Kubernetes 部署 + +这是用来快速部署本代理池的 Helm Charts。 + +首先需要有一个 Kubernetes 集群,其次需要安装 Helm,确保 helm 命令可以正常运行。 + +安装参考: + +- Kubernetes:[https://setup.scrape.center/kubernetes](https://setup.scrape.center/kubernetes)。 +- Helm: [https://setup.scrape.center/helm](https://setup.scrape.center/helm)。 + +## 安装 + +安装直接使用 helm 命令在本文件夹运行即可,使用 `-n` 可以制定 NameSpace。 + +```shell +helm install proxypool-app . -n scrape +``` + +其中 proxypool-app 就是应用的名字,可以任意取名,它会用作代理池 Deplyment 的名称。 + +如果需要覆盖变量,可以修改 values.yaml 文件,执行如下命令安装: + +```shell +helm install proxypool-app . -f values.yaml -n scrape +``` + +## 更新 + +如果需要更新配置,可以修改 values.yaml 文件,执行如下命令更新版本: + +```shell +helm upgrade proxypool-app . 
-f values.yaml -n scrape +``` + +## 卸载 + +如果不想使用了,可以只用 uninstall 命令卸载: + +```shell +helm uninstall proxypool-app -n scrape +``` diff --git a/kubernetes/templates/_helpers.tpl b/kubernetes/templates/_helpers.tpl new file mode 100644 index 00000000..31911df1 --- /dev/null +++ b/kubernetes/templates/_helpers.tpl @@ -0,0 +1,53 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "proxypool.name" -}} +{{- default .Chart.Name .Values.name | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "proxypool.fullname" -}} +{{- if .Values.fullname }} +{{- .Values.fullname | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.name }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "proxypool.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "proxypool.labels" -}} +helm.sh/chart: {{ include "proxypool.chart" . }} +{{ include "proxypool.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "proxypool.selectorLabels" -}} +app.kubernetes.io/name: {{ include "proxypool.fullname" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/kubernetes/templates/proxypool-deployment.yaml b/kubernetes/templates/proxypool-deployment.yaml new file mode 100644 index 00000000..a12854d9 --- /dev/null +++ b/kubernetes/templates/proxypool-deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + revisionHistoryLimit: {{ .Values.deployment.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "proxypool.labels" . | nindent 8 }} + template: + metadata: + labels: + {{- include "proxypool.labels" . | nindent 8 }} + spec: + restartPolicy: {{ .Values.deployment.restartPolicy }} + containers: + - name: {{ include "proxypool.fullname" . }} + image: {{ .Values.deployment.image }} + ports: + - containerPort: 5555 + protocol: TCP + imagePullPolicy: {{ .Values.deployment.imagePullPolicy }} + livenessProbe: + httpGet: + path: /random + port: 5555 + initialDelaySeconds: 60 + periodSeconds: 5 + failureThreshold: 5 + timeoutSeconds: 10 + resources: + {{- toYaml .Values.deployment.resources | nindent 12 }} + env: + {{- toYaml .Values.deployment.env | nindent 12 }} diff --git a/kubernetes/templates/proxypool-ingress.yaml b/kubernetes/templates/proxypool-ingress.yaml new file mode 100644 index 00000000..0706f5d2 --- /dev/null +++ b/kubernetes/templates/proxypool-ingress.yaml @@ -0,0 +1,41 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "proxypool.fullname" . 
-}} +{{- $svcPort := .Values.service.port -}} +{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ . }} + backend: + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} diff --git a/kubernetes/templates/proxypool-service.yaml b/kubernetes/templates/proxypool-service.yaml new file mode 100644 index 00000000..3d4285b4 --- /dev/null +++ b/kubernetes/templates/proxypool-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: 5555 + protocol: TCP + name: http + selector: + {{- include "proxypool.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/templates/redis-deployment.yaml b/kubernetes/templates/redis-deployment.yaml new file mode 100644 index 00000000..4acf4351 --- /dev/null +++ b/kubernetes/templates/redis-deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + replicas: 1 + revisionHistoryLimit: 1 + selector: + matchLabels: + app: proxypool-redis + template: + metadata: + labels: + app: proxypool-redis + spec: + containers: + - image: redis:alpine + name: proxypool-redis + ports: + - containerPort: 6379 + resources: + limits: + memory: "100Mi" + cpu: "100m" + requests: + memory: "100Mi" + cpu: "100m" + restartPolicy: Always diff --git a/kubernetes/templates/redis-service.yaml b/kubernetes/templates/redis-service.yaml new file mode 100644 index 00000000..5dbda554 --- /dev/null +++ b/kubernetes/templates/redis-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + ports: + - name: "6379" + port: 6379 + targetPort: 6379 + selector: + app: proxypool-redis \ No newline at end of file diff --git a/kubernetes/values.yaml b/kubernetes/values.yaml new file mode 100644 index 00000000..15b25377 --- /dev/null +++ b/kubernetes/values.yaml @@ -0,0 +1,39 @@ +name: proxypool +fullname: proxypool-app + +deployment: + image: germey/proxypool:master + imagePullPolicy: Always + restartPolicy: Always + revisionHistoryLimit: 2 + successfulJobsHistoryLimit: 1 + replicas: 1 + resources: + limits: + memory: "200Mi" + cpu: "80m" + requests: + memory: "200Mi" + cpu: "80m" + env: + - name: PROXYPOOL_REDIS_HOST + value: "proxypool-redis" + - name: PROXYPOOL_REDIS_PORT + value: "6379" + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + hosts: + - host: proxypool.scrape.center + paths: + - "/" + tls: + - secretName: tls-wildcard-scrape-center + hosts: + - proxypool.scrape.center diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index 
aa35430e..611a816f 100644 --- a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -1,32 +1,49 @@ -from retrying import retry +from retrying import RetryError, retry import requests from loguru import logger from proxypool.setting import GET_TIMEOUT +from fake_headers import Headers +import time class BaseCrawler(object): urls = [] - + @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) def fetch(self, url, **kwargs): try: + headers = Headers(headers=True).generate() kwargs.setdefault('timeout', GET_TIMEOUT) kwargs.setdefault('verify', False) + kwargs.setdefault('headers', headers) response = requests.get(url, **kwargs) if response.status_code == 200: response.encoding = 'utf-8' return response.text - except requests.ConnectionError: + except (requests.ConnectionError, requests.ReadTimeout): return - - @logger.catch + + def process(self, html, url): + """ + used for parse html + """ + for proxy in self.parse(html): + logger.info(f'fetched proxy {proxy.string()} from {url}') + yield proxy + def crawl(self): """ crawl main method """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url) + if not html: + continue + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py index 09a3ee45..aec7ea68 100644 --- a/proxypool/crawlers/public/daili66.py +++ b/proxypool/crawlers/public/daili66.py @@ -4,7 +4,7 @@ BASE_URL = 'http://www.66ip.cn/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 3 class Daili66Crawler(BaseCrawler): diff --git a/proxypool/crawlers/public/data5u.py b/proxypool/crawlers/public/data5u.py index e36bf664..62158c20 100644 --- a/proxypool/crawlers/public/data5u.py +++ b/proxypool/crawlers/public/data5u.py @@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler): data5u crawler, http://www.data5u.com """ urls = [BASE_URL] - - headers = { - 'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36' - } - @logger.catch - def crawl(self): - """ - crawl main method - """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url, headers=self.headers) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy - def parse(self, html): """ parse html file to get proxies diff --git a/proxypool/crawlers/public/docip.py b/proxypool/crawlers/public/docip.py new file mode 100644 index 00000000..154871fb --- /dev/null +++ b/proxypool/crawlers/public/docip.py @@ -0,0 +1,38 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://www.docip.net/data/free.json?t={date}' + + + +class DocipCrawler(BaseCrawler): + """ + Docip crawler, https://www.docip.net/data/free.json + """ + urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in 
proxy_list: + host = proxy_item['ip'] + port = host.split(':')[-1] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + +if __name__ == '__main__': + crawler = DocipCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/fatezero.py b/proxypool/crawlers/public/fatezero.py new file mode 100644 index 00000000..681cf9e4 --- /dev/null +++ b/proxypool/crawlers/public/fatezero.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'http://proxylist.fatezero.org/proxy.list' + + +class FatezeroCrawler(BaseCrawler): + """ + Fatezero crawler,http://proxylist.fatezero.org + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + hosts_ports = html.split('\n') + for addr in hosts_ports: + if(addr): + ip_address = json.loads(addr) + host = ip_address['host'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + +if __name__ == '__main__': + crawler = FatezeroCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/geonodedaili.py b/proxypool/crawlers/public/geonodedaili.py new file mode 100644 index 00000000..f71f16ec --- /dev/null +++ b/proxypool/crawlers/public/geonodedaili.py @@ -0,0 +1,71 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc' +MAX_PAGE = 18 + + +class GeonodeCrawler(BaseCrawler): + """ + Geonode crawler, https://proxylist.geonode.com/ + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in proxy_list: + host = proxy_item['ip'] + port = proxy_item['port'] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + def crawl(self): + """ + override crawl main method + add headers + """ + headers = { + 'authority': 'proxylist.geonode.com', + 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"', + 'accept': 'application/json, text/plain, */*', + 'sec-ch-ua-mobile': '?0', + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36', + 'sec-ch-ua-platform': '"macOS"', + 'origin': 'https://geonode.com', + 'sec-fetch-site': 'same-site', + 'sec-fetch-mode': 'cors', + 'sec-fetch-dest': 'empty', + 'referer': 'https://geonode.com/', + 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7', + 'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"', + } + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url, headers=headers) + if not html: + continue + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') + + +if __name__ == '__main__': + crawler = GeonodeCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/goubanjia.py b/proxypool/crawlers/public/goubanjia.py new file mode 100644 index 00000000..57157858 
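
The crawlers added above all follow the same contract defined in `proxypool/crawlers/base.py`: declare a `urls` list, implement `parse(html)` as a generator of `Proxy` objects, and inherit `fetch()`/`crawl()` from `BaseCrawler`. A minimal sketch of one more crawler built on that contract is shown below; the class name, source URL, and its plain-text `host:port` response format are assumptions for illustration only, not a real feed. Site-specific crawlers like this are what the `proxypool/crawlers/private` volume exposed in the Dockerfile is intended to hold.

```python
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

# hypothetical plain-text proxy list, one "host:port" per line (illustration only)
BASE_URL = 'https://example.com/free-proxies.txt'


class ExampleTxtCrawler(BaseCrawler):
    """
    illustrative crawler: expects one host:port pair per line of plain text
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse plain text response to get proxies
        :return:
        """
        for line in html.split('\n'):
            line = line.strip()
            if not line or ':' not in line:
                continue
            host, port = line.split(':', 1)
            yield Proxy(host=host, port=int(port))


if __name__ == '__main__':
    crawler = ExampleTxtCrawler()
    for proxy in crawler.crawl():
        print(proxy)
```
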
--- /dev/null +++ b/proxypool/crawlers/public/goubanjia.py @@ -0,0 +1,44 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'http://www.goubanjia.com/' + + +class GoubanjiaCrawler(BaseCrawler): + """ + ip Goubanjia crawler, http://www.goubanjia.com/ + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html)('.ip').items() + # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))]) + for td in doc: + trs = td.children() + ip_str = '' + for tr in trs: + attrib = tr.attrib + if 'style' in attrib and 'none' in tr.attrib['style']: + continue + ip_str+= '' if not tr.text else tr.text + addr_split = ip_str.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + else: + port = trs[-1].text + host = ip_str.replace(port,'') + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = GoubanjiaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py new file mode 100644 index 00000000..4ca5e529 --- /dev/null +++ b/proxypool/crawlers/public/ihuan.py @@ -0,0 +1,36 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'https://ip.ihuan.me/today/{path}.html' + + +class IhuanCrawler(BaseCrawler): + """ + ip ihuan crawler, https://ip.ihuan.me + """ + path = time.strftime("%Y/%m/%d/%H", time.localtime()) + urls = [BASE_URL.format(path=path)] + ignore = False + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + # doc = pq(html)('.text-left') + ip_address = re.compile('([\d:\.]*).*?<br>') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = IhuanCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ip3366.py b/proxypool/crawlers/public/ip3366.py index 78d29447..dfbc06f2 100644 --- a/proxypool/crawlers/public/ip3366.py +++ b/proxypool/crawlers/public/ip3366.py @@ -3,15 +3,15 @@ import re -MAX_PAGE = 5 -BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}' +MAX_PAGE = 3 +BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}' class IP3366Crawler(BaseCrawler): """ ip3366 crawler, http://www.ip3366.net/ """ - urls = [BASE_URL.format(page=i) for i in range(1, 8)] + urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)] def parse(self, html): """ diff --git a/proxypool/crawlers/public/ip89.py b/proxypool/crawlers/public/ip89.py new file mode 100644 index 00000000..f67c3870 --- /dev/null +++ b/proxypool/crawlers/public/ip89.py @@ -0,0 +1,33 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re + +MAX_NUM = 9999 +BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM) + + +class Ip89Crawler(BaseCrawler): + """ + 89ip crawler, http://api.89ip.cn + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + ip_address = re.compile('([\d:\.]*)<br>') + hosts_ports = 
ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Ip89Crawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/jiangxianli.py b/proxypool/crawlers/public/jiangxianli.py new file mode 100644 index 00000000..861dd1e5 --- /dev/null +++ b/proxypool/crawlers/public/jiangxianli.py @@ -0,0 +1,39 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + + +BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' + +MAX_PAGE = 3 + + +class JiangxianliCrawler(BaseCrawler): + """ + jiangxianli crawler,https://ip.jiangxianli.com/ + """ + + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + result = json.loads(html) + if result['code'] != 0: + return + MAX_PAGE = int(result['data']['last_page']) + hosts_ports = result['data']['data'] + for ip_address in hosts_ports: + if(ip_address): + host = ip_address['ip'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = JiangxianliCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/kuaidaili.py b/proxypool/crawlers/public/kuaidaili.py index f3fa6437..3602833e 100644 --- a/proxypool/crawlers/public/kuaidaili.py +++ b/proxypool/crawlers/public/kuaidaili.py @@ -4,15 +4,15 @@ from pyquery import PyQuery as pq -BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/' -MAX_PAGE = 5 +BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/' +MAX_PAGE = 3 class KuaidailiCrawler(BaseCrawler): """ kuaidaili crawler, https://www.kuaidaili.com/ """ - urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + urls = [BASE_URL.format(type=type,page=page) for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)] def parse(self, html): """ diff --git a/proxypool/crawlers/public/seofangfa.py b/proxypool/crawlers/public/seofangfa.py new file mode 100644 index 00000000..1f5a20a2 --- /dev/null +++ b/proxypool/crawlers/public/seofangfa.py @@ -0,0 +1,34 @@ +import requests +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +requests.packages.urllib3.disable_warnings() +BASE_URL = "https://proxy.seofangfa.com/" +MAX_PAGE = 1 + + +class SeoFangFaCrawler(BaseCrawler): + """ + seo方法 crawler, https://proxy.seofangfa.com/ + """ + urls = ["https://proxy.seofangfa.com/"] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.table tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = SeoFangFaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/taiyangdaili.py b/proxypool/crawlers/public/taiyangdaili.py new file mode 100644 index 00000000..b42388cc --- /dev/null +++ b/proxypool/crawlers/public/taiyangdaili.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from pyquery import PyQuery as pq + +BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' +MAX_PAGE = 3 + + +class 
TaiyangdailiCrawler(BaseCrawler): + """ + taiyangdaili crawler, http://www.taiyanghttp.com/free/ + """ + urls = [BaseUrl.format(num=i) for i in range(1, 6)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#ip_list .tr.ip_tr').items() + for tr in trs: + host = tr.find('div:nth-child(1)').text() + port = tr.find('div:nth-child(2)').text() + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = TaiyangdailiCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/uqidata.py b/proxypool/crawlers/public/uqidata.py new file mode 100644 index 00000000..3e54b2dc --- /dev/null +++ b/proxypool/crawlers/public/uqidata.py @@ -0,0 +1,49 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from loguru import logger + +BASE_URL = 'https://ip.uqidata.com/free/index.html' + + +class UqidataCrawler(BaseCrawler): + """ + Uqidata crawler, https://ip.uqidata.com/free/index.html + """ + urls = [BASE_URL] + ignore = True + + def encode(input_str): + tmp = [] + for i in range(len(input_str)): + tmp.append("ABCDEFGHIZ".find(input_str[i])) + result = "".join(str(i) for i in tmp) + result = int(result) >> 0x03 + return result + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items() + for tr in trs: + ip_html = tr('td.ip').find("*").items() + host = '' + for i in ip_html: + if i.attr('style') is not None and 'none' in i.attr('style'): + continue + if i.text() == '': + continue + host += i.text() + + port_code = tr('td.port').attr('class').split(' ')[1] + port = UqidataCrawler.encode(port_code) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = UqidataCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/xiaoshudaili.py b/proxypool/crawlers/public/xiaoshudaili.py new file mode 100644 index 00000000..f6fd0869 --- /dev/null +++ b/proxypool/crawlers/public/xiaoshudaili.py @@ -0,0 +1,54 @@ +import re +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://www.xsdaili.cn/" +PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html" +MAX_PAGE = 3 + + +class XiaoShuCrawler(BaseCrawler): + """ + 小舒代理 crawler, http://www.xsdaili.cn/ + """ + + def __init__(self): + """ + init urls + """ + try: + html = self.fetch(url=BASE_URL) + except: + self.urls = [] + return + doc = pq(html) + title = doc(".title:eq(0) a").items() + latest_page = 0 + for t in title: + res = re.search(r"/(\d+)\.html", t.attr("href")) + latest_page = int(res.group(1)) if res else 0 + if latest_page: + self.urls = [PAGE_BASE_URL.format(page=page) for page in range( + latest_page - MAX_PAGE, latest_page)] + else: + self.urls = [] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + contents = doc('.cont').text() + contents = contents.split("\n") + for content in contents: + c = content[:content.find("@")] + host, port = c.split(":") + yield Proxy(host=host, port=int(port)) + + +if __name__ == '__main__': + crawler = XiaoShuCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/xicidaili.py b/proxypool/crawlers/public/xicidaili.py index fdd2a317..53a4872e 100644 --- 
a/proxypool/crawlers/public/xicidaili.py +++ b/proxypool/crawlers/public/xicidaili.py @@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler): """ urls = [BASE_URL] ignore = True - - headers = { - 'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36' - } - @logger.catch - def crawl(self): - """ - crawl main method - """ - for url in self.urls: - logger.info(f'fetching {url}') - html = self.fetch(url, headers=self.headers) - for proxy in self.parse(html): - logger.info(f'fetched proxy {proxy.string()} from {url}') - yield proxy - def parse(self, html): """ parse html file to get proxies @@ -49,4 +33,3 @@ def parse(self, html): crawler = XicidailiCrawler() for proxy in crawler.crawl(): print(proxy) - diff --git a/proxypool/crawlers/public/yqie.py b/proxypool/crawlers/public/yqie.py new file mode 100644 index 00000000..fb3feaf8 --- /dev/null +++ b/proxypool/crawlers/public/yqie.py @@ -0,0 +1,32 @@ +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://ip.yqie.com/ipproxy.htm" +MAX_PAGE = 1 + + +class YqIeCrawler(BaseCrawler): + """ + ip yqie crawler, http://ip.yqie.com/ipproxy.htm + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#GridViewOrder tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = YqIeCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/zhandaye.py b/proxypool/crawlers/public/zhandaye.py index b6278a28..1522cdf0 100755 --- a/proxypool/crawlers/public/zhandaye.py +++ b/proxypool/crawlers/public/zhandaye.py @@ -6,7 +6,8 @@ BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 5 * 2 + class ZhandayeCrawler(BaseCrawler): """ @@ -56,4 +57,3 @@ def parse(self, html): crawler = ZhandayeCrawler() for proxy in crawler.crawl(): print(proxy) - diff --git a/proxypool/processors/getter.py b/proxypool/processors/getter.py index 1a1d5261..c5c16296 100644 --- a/proxypool/processors/getter.py +++ b/proxypool/processors/getter.py @@ -2,13 +2,13 @@ from proxypool.storages.redis import RedisClient from proxypool.setting import PROXY_NUMBER_MAX from proxypool.crawlers import __all__ as crawlers_cls - +from proxypool.testers import __all__ as testers_cls class Getter(object): """ getter of proxypool """ - + def __init__(self): """ init db and crawlers @@ -16,14 +16,16 @@ def __init__(self): self.redis = RedisClient() self.crawlers_cls = crawlers_cls self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls] - + self.testers_cls = testers_cls + self.testers = [tester_cls() for tester_cls in self.testers_cls] + def is_full(self): """ if proxypool if full return: bool """ return self.redis.count() >= PROXY_NUMBER_MAX - + @logger.catch def run(self): """ @@ -36,6 +38,7 @@ def run(self): logger.info(f'crawler {crawler} to get proxy') for proxy in crawler.crawl(): self.redis.add(proxy) + [self.redis.add(proxy, redis_key=tester.key) for tester in self.testers] if __name__ == '__main__': diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index e87f82f5..50144590 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,11 +1,33 @@ -from flask 
import Flask, g +from flask import Flask, g, request +from proxypool.exceptions import PoolEmptyException from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED - +from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED +import functools __all__ = ['app'] app = Flask(__name__) +if IS_DEV: + app.debug = True + + +def auth_required(func): + @functools.wraps(func) + def decorator(*args, **kwargs): + # conditional decorator, when setting API_KEY is set, otherwise just ignore this decorator + if API_KEY == "": + return func(*args, **kwargs) + if request.headers.get('API-KEY', None) is not None: + api_key = request.headers.get('API-KEY') + else: + return {"message": "Please provide an API key in header"}, 400 + # Check if API key is correct and valid + if request.method == "GET" and api_key == API_KEY: + return func(*args, **kwargs) + else: + return {"message": "The provided API key is not valid"}, 403 + + return decorator def get_conn(): @@ -19,6 +41,7 @@ def get_conn(): @app.route('/') +@auth_required def index(): """ get home page, you can define your own templates @@ -28,23 +51,54 @@ def index(): @app.route('/random') +@auth_required def get_proxy(): """ - get a random proxy + get a random proxy, can query the specific sub-pool according the (redis) key + if PROXY_RAND_KEY_DEGRADED is set to True, will get a universal random proxy if no proxy found in the sub-pool :return: get a random proxy """ + key = request.args.get('key') conn = get_conn() + # return conn.random(key).string() if key else conn.random().string() + if key: + try: + return conn.random(key).string() + except PoolEmptyException: + if not PROXY_RAND_KEY_DEGRADED: + raise return conn.random().string() +@app.route('/all') +@auth_required +def get_proxy_all(): + """ + get a random proxy + :return: get a random proxy + """ + key = request.args.get('key') + + conn = get_conn() + proxies = conn.all(key) if key else conn.all() + proxies_string = '' + if proxies: + for proxy in proxies: + proxies_string += str(proxy) + '\n' + + return proxies_string + + @app.route('/count') +@auth_required def get_count(): """ get the count of proxies :return: count, int """ conn = get_conn() - return str(conn.count()) + key = request.args.get('key') + return str(conn.count(key)) if key else conn.count() if __name__ == '__main__': diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index e0812110..6937af8c 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -3,10 +3,11 @@ from loguru import logger from proxypool.schemas import Proxy from proxypool.storages.redis import RedisClient -from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS +from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS, \ + TEST_DONT_SET_MAX_SCORE from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError from asyncio import TimeoutError - +from proxypool.testers import __all__ as testers_cls EXCEPTIONS = ( ClientProxyConnectionError, @@ -23,14 +24,16 @@ class Tester(object): """ tester for testing proxies in queue """ - + def __init__(self): """ init redis """ self.redis = RedisClient() self.loop = asyncio.get_event_loop() - + self.testers_cls = testers_cls + self.testers = [tester_cls() for tester_cls in self.testers_cls] + async def test(self, proxy: Proxy): """ test single 
proxy @@ -42,28 +45,69 @@ async def test(self, proxy: Proxy): logger.debug(f'testing {proxy.string()}') # if TEST_ANONYMOUS is True, make sure that # the proxy has the effect of hiding the real IP + # logger.debug(f'TEST_ANONYMOUS {TEST_ANONYMOUS}') if TEST_ANONYMOUS: url = 'https://httpbin.org/ip' async with session.get(url, timeout=TEST_TIMEOUT) as response: resp_json = await response.json() origin_ip = resp_json['origin'] + # logger.debug(f'origin ip is {origin_ip}') async with session.get(url, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT) as response: resp_json = await response.json() anonymous_ip = resp_json['origin'] + logger.debug(f'anonymous ip is {anonymous_ip}') assert origin_ip != anonymous_ip assert proxy.host == anonymous_ip async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT, allow_redirects=False) as response: if response.status in TEST_VALID_STATUS: - self.redis.max(proxy) - logger.debug(f'proxy {proxy.string()} is valid, set max score') + if TEST_DONT_SET_MAX_SCORE: + logger.debug( + f'proxy {proxy.string()} is valid, remain current score') + else: + self.redis.max(proxy) + logger.debug( + f'proxy {proxy.string()} is valid, set max score') else: self.redis.decrease(proxy) - logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + logger.debug( + f'proxy {proxy.string()} is invalid, decrease score') + # if independent tester class found, create new set of storage and do the extra test + for tester in self.testers: + key = tester.key + if self.redis.exists(proxy, key): + test_url = tester.test_url + headers = tester.headers() + cookies = tester.cookies() + async with session.get(test_url, proxy=f'http://{proxy.string()}', + timeout=TEST_TIMEOUT, + headers=headers, + cookies=cookies, + allow_redirects=False) as response: + resp_text = await response.text() + is_valid = await tester.parse(resp_text, test_url, proxy.string()) + if is_valid: + if tester.test_dont_set_max_score: + logger.info( + f'key[{key}] proxy {proxy.string()} is valid, remain current score') + else: + self.redis.max( + proxy, key, tester.proxy_score_max) + logger.info( + f'key[{key}] proxy {proxy.string()} is valid, set max score') + else: + self.redis.decrease( + proxy, tester.key, tester.proxy_score_min) + logger.info( + f'key[{key}] proxy {proxy.string()} is invalid, decrease score') + except EXCEPTIONS: self.redis.decrease(proxy) - logger.debug(f'proxy {proxy.string()} is invalid, decrease score') - + [self.redis.decrease(proxy, tester.key, tester.proxy_score_min) + for tester in self.testers] + logger.debug( + f'proxy {proxy.string()} is invalid, decrease score') + @logger.catch def run(self): """ @@ -76,15 +120,25 @@ def run(self): logger.debug(f'{count} proxies to test') cursor = 0 while True: - logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}') + logger.debug( + f'testing proxies use cursor {cursor}, count {TEST_BATCH}') cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH) if proxies: - tasks = [self.test(proxy) for proxy in proxies] + tasks = [self.loop.create_task( + self.test(proxy)) for proxy in proxies] self.loop.run_until_complete(asyncio.wait(tasks)) if not cursor: break +def run_tester(): + host = '96.113.165.182' + port = '3128' + tasks = [tester.test(Proxy(host=host, port=port))] + tester.loop.run_until_complete(asyncio.wait(tasks)) + + if __name__ == '__main__': tester = Tester() tester.run() + # run_tester() diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py index bec0c595..a2d18abe 100644 
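
Taken together with the `auth_required` decorator and the `key` query parameter added to `server.py` above, a client can now authenticate with the `API-KEY` header and scope a request to a specific sub-pool. A minimal sketch of such a request follows; the key value, sub-pool name, and local address are placeholders, and the header is only needed when the server's `API_KEY` setting is non-empty.

```python
import requests

PROXYPOOL_URL = 'http://127.0.0.1:5555/random'

# placeholder value; must match the API_KEY setting on the server if one is configured
headers = {'API-KEY': 'my-secret-key'}
# optional: draw from a specific sub-pool; omit params to use the universal pool
params = {'key': 'proxies:weibo'}

resp = requests.get(PROXYPOOL_URL, headers=headers, params=params, timeout=5)
print(resp.status_code, resp.text)  # e.g. 200 116.196.115.209:8080
```

The same `key` parameter is accepted by the `/all` and `/count` routes shown above.
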
diff --git a/proxypool/scheduler.py b/proxypool/scheduler.py
index bec0c595..a2d18abe 100644
--- a/proxypool/scheduler.py
+++ b/proxypool/scheduler.py
@@ -3,7 +3,8 @@
 from proxypool.processors.server import app
 from proxypool.processors.getter import Getter
 from proxypool.processors.tester import Tester
-from proxypool.setting import CYCLE_GETTER, CYCLE_TESTER, API_HOST, API_THREADED, API_PORT, ENABLE_SERVER, \
+from proxypool.setting import APP_PROD_METHOD_GEVENT, APP_PROD_METHOD_MEINHELD, APP_PROD_METHOD_TORNADO, CYCLE_GETTER, CYCLE_TESTER, API_HOST, \
+    API_THREADED, API_PORT, ENABLE_SERVER, IS_PROD, APP_PROD_METHOD, \
     ENABLE_GETTER, ENABLE_TESTER, IS_WINDOWS
 from loguru import logger
 
@@ -18,7 +19,7 @@ class Scheduler():
     """
     scheduler
     """
-    
+
     def run_tester(self, cycle=CYCLE_TESTER):
         """
         run tester
@@ -33,7 +34,7 @@ def run_tester(self, cycle=CYCLE_TESTER):
             tester.run()
             loop += 1
             time.sleep(cycle)
-    
+
     def run_getter(self, cycle=CYCLE_GETTER):
         """
         run getter
@@ -48,7 +49,7 @@ def run_getter(self, cycle=CYCLE_GETTER):
             getter.run()
             loop += 1
             time.sleep(cycle)
-    
+
     def run_server(self):
         """
         run server for api
@@ -56,43 +57,84 @@ def run_server(self):
         if not ENABLE_SERVER:
             logger.info('server not enabled, exit')
             return
-        app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED)
-    
+        if IS_PROD:
+            if APP_PROD_METHOD == APP_PROD_METHOD_GEVENT:
+                try:
+                    from gevent.pywsgi import WSGIServer
+                except ImportError as e:
+                    logger.exception(e)
+                else:
+                    http_server = WSGIServer((API_HOST, API_PORT), app)
+                    http_server.serve_forever()
+
+            elif APP_PROD_METHOD == APP_PROD_METHOD_TORNADO:
+                try:
+                    from tornado.wsgi import WSGIContainer
+                    from tornado.httpserver import HTTPServer
+                    from tornado.ioloop import IOLoop
+                except ImportError as e:
+                    logger.exception(e)
+                else:
+                    http_server = HTTPServer(WSGIContainer(app))
+                    http_server.listen(API_PORT)
+                    IOLoop.instance().start()
+
+            elif APP_PROD_METHOD == APP_PROD_METHOD_MEINHELD:
+                try:
+                    import meinheld
+                except ImportError as e:
+                    logger.exception(e)
+                else:
+                    meinheld.listen((API_HOST, API_PORT))
+                    meinheld.run(app)
+
+            else:
+                logger.error("unsupported APP_PROD_METHOD")
+                return
+        else:
+            app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED, use_reloader=False)
+
     def run(self):
         global tester_process, getter_process, server_process
         try:
             logger.info('starting proxypool...')
             if ENABLE_TESTER:
-                tester_process = multiprocessing.Process(target=self.run_tester)
+                tester_process = multiprocessing.Process(
+                    target=self.run_tester)
                 logger.info(f'starting tester, pid {tester_process.pid}...')
                 tester_process.start()
-            
+
             if ENABLE_GETTER:
-                getter_process = multiprocessing.Process(target=self.run_getter)
-                logger.info(f'starting getter, pid{getter_process.pid}...')
+                getter_process = multiprocessing.Process(
+                    target=self.run_getter)
+                logger.info(f'starting getter, pid {getter_process.pid}...')
                 getter_process.start()
-            
+
             if ENABLE_SERVER:
-                server_process = multiprocessing.Process(target=self.run_server)
-                logger.info(f'starting server, pid{server_process.pid}...')
+                server_process = multiprocessing.Process(
+                    target=self.run_server)
+                logger.info(f'starting server, pid {server_process.pid}...')
                 server_process.start()
-            
-            tester_process.join()
-            getter_process.join()
-            server_process.join()
+
+            tester_process and tester_process.join()
+            getter_process and getter_process.join()
+            server_process and server_process.join()
         except KeyboardInterrupt:
             logger.info('received keyboard interrupt signal')
-            tester_process.terminate()
-            getter_process.terminate()
-            server_process.terminate()
+            tester_process and tester_process.terminate()
+            getter_process and getter_process.terminate()
+            server_process and server_process.terminate()
         finally:
             # must call join method before calling is_alive
-            tester_process.join()
-            getter_process.join()
-            server_process.join()
-            logger.info(f'tester is {"alive" if tester_process.is_alive() else "dead"}')
-            logger.info(f'getter is {"alive" if getter_process.is_alive() else "dead"}')
-            logger.info(f'server is {"alive" if server_process.is_alive() else "dead"}')
+            tester_process and tester_process.join()
+            getter_process and getter_process.join()
+            server_process and server_process.join()
+            logger.info(
+                f'tester is {"alive" if tester_process.is_alive() else "dead"}')
+            logger.info(
+                f'getter is {"alive" if getter_process.is_alive() else "dead"}')
+            logger.info(
+                f'server is {"alive" if server_process.is_alive() else "dead"}')
             logger.info('proxy terminated')
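`run_server` now branches on `APP_PROD_METHOD` when the app runs in prod mode, with gevent as the default container. Stripped of the setting lookups, the gevent branch is roughly equivalent to the sketch below, assuming gevent is installed and the default `API_HOST`/`API_PORT` values of `0.0.0.0`/`5555`:

```python
# Rough standalone equivalent of the gevent branch above (assumes gevent is installed).
from gevent.pywsgi import WSGIServer

from proxypool.processors.server import app

# serve the existing Flask app on the default host and port
http_server = WSGIServer(('0.0.0.0', 5555), app)
http_server.serve_forever()
```
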
diff --git a/proxypool/setting.py b/proxypool/setting.py
index 7c3008b3..a445667e 100644
--- a/proxypool/setting.py
+++ b/proxypool/setting.py
@@ -2,7 +2,7 @@
 from os.path import dirname, abspath, join
 from environs import Env
 from loguru import logger
-from proxypool.utils.parse import parse_redis_connection_string
+import shutil
 
 env = Env()
 
@@ -23,27 +23,41 @@
 APP_PROD = IS_PROD = APP_ENV == PROD_MODE
 APP_TEST = IS_TEST = APP_ENV == TEST_MODE
 
+
+# Which WSGI container is used to run applications
+# - gevent: pip install gevent
+# - tornado: pip install tornado
+# - meinheld: pip install meinheld
+APP_PROD_METHOD_GEVENT = 'gevent'
+APP_PROD_METHOD_TORNADO = 'tornado'
+APP_PROD_METHOD_MEINHELD = 'meinheld'
+APP_PROD_METHOD = env.str('APP_PROD_METHOD', APP_PROD_METHOD_GEVENT).lower()
+
 # redis host
-REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1')
+REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST',
+                     env.str('REDIS_HOST', '127.0.0.1'))
 # redis port
-REDIS_PORT = env.int('REDIS_PORT', 6379)
+REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379))
 # redis password, if no password, set it to None
-REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
+REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD',
+                         env.str('REDIS_PASSWORD', None))
 # redis db, if no choice, set it to 0
-REDIS_DB = env.int('REDIS_DB', 0)
-# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0
-REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
-
-if REDIS_CONNECTION_STRING:
-    REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING)
+REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0))
+# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0,
+# please refer to https://redis-py.readthedocs.io/en/stable/connections.html#redis.client.Redis.from_url
+REDIS_CONNECTION_STRING = env.str(
+    'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None))
 
 # redis hash table key name
-REDIS_KEY = env.str('REDIS_KEY', 'proxies:universal')
+REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str(
+    'REDIS_KEY', 'proxies:universal'))
 
 # definition of proxy scores
-PROXY_SCORE_MAX = 100
-PROXY_SCORE_MIN = 0
-PROXY_SCORE_INIT = 10
+PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
+PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
+PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
+# whether to get a universal random proxy if no proxy exists in the sub-pool identified by a specific key
+PROXY_RAND_KEY_DEGRADED = env.bool('PROXY_RAND_KEY_DEGRADED', True)
 
 # definition of proxy number
 PROXY_NUMBER_MAX = 50000
 
@@ -60,22 +74,50 @@
 TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
 TEST_BATCH = env.int('TEST_BATCH', 20)
 # only save anonymous proxy
-TEST_ANONYMOUS = True
+TEST_ANONYMOUS = env.bool('TEST_ANONYMOUS', True)
 # TEST_HEADERS = env.json('TEST_HEADERS', {
 #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
 # })
 TEST_VALID_STATUS = env.list('TEST_VALID_STATUS', [200, 206, 302])
+# whether to set max score when one proxy is tested valid
+TEST_DONT_SET_MAX_SCORE = env.bool('TEST_DONT_SET_MAX_SCORE', False)
 
 # definition of api
 API_HOST = env.str('API_HOST', '0.0.0.0')
 API_PORT = env.int('API_PORT', 5555)
 API_THREADED = env.bool('API_THREADED', True)
+# add an api key to get proxy
+# need a header of `API-KEY` in get request to pass the authenticate
+# API_KEY='', do not need `API-KEY` header
+API_KEY = env.str('API_KEY', '')
 
 # flags of enable
 ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
 ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
 ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
 
-logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days')
-logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')
+ENABLE_LOG_FILE = env.bool('ENABLE_LOG_FILE', True)
+ENABLE_LOG_RUNTIME_FILE = env.bool('ENABLE_LOG_RUNTIME_FILE', True)
+ENABLE_LOG_ERROR_FILE = env.bool('ENABLE_LOG_ERROR_FILE', True)
+
+
+LOG_LEVEL_MAP = {
+    DEV_MODE: "DEBUG",
+    TEST_MODE: "INFO",
+    PROD_MODE: "ERROR"
+}
+
+LOG_LEVEL = LOG_LEVEL_MAP.get(APP_ENV)
+LOG_ROTATION = env.str('LOG_ROTATION', '500MB')
+LOG_RETENTION = env.str('LOG_RETENTION', '1 week')
+
+if ENABLE_LOG_FILE:
+    if ENABLE_LOG_RUNTIME_FILE:
+        logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')),
+                   level=LOG_LEVEL, rotation=LOG_ROTATION, retention=LOG_RETENTION)
+    if ENABLE_LOG_ERROR_FILE:
+        logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')),
+                   level='ERROR', rotation=LOG_ROTATION)
+else:
+    shutil.rmtree(LOG_DIR, ignore_errors=True)
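`REDIS_CONNECTION_STRING` is now passed straight to `redis.StrictRedis.from_url` (see the storage change below) instead of the removed custom parser, so it has to follow redis-py's `redis://[:password@]host[:port][/database]` format. The sketch below shows two equivalent ways to reach the same instance; the password and db number are placeholders, not values from this repo:

```python
# Placeholder credentials, shown only to illustrate the two configuration styles.
import redis

# discrete values, as read from PROXYPOOL_REDIS_HOST / _PORT / _PASSWORD / _DB
client_a = redis.StrictRedis(host='localhost', port=6379, password='secret',
                             db=0, decode_responses=True)

# single URL, as read from PROXYPOOL_REDIS_CONNECTION_STRING
client_b = redis.StrictRedis.from_url('redis://:secret@localhost:6379/0',
                                      decode_responses=True)
```
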
diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py
index 60a03e9f..2d052323 100644
--- a/proxypool/storages/redis.py
+++ b/proxypool/storages/redis.py
@@ -1,7 +1,7 @@
 import redis
 from proxypool.exceptions import PoolEmptyException
 from proxypool.schemas.proxy import Proxy
-from proxypool.setting import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \
+from proxypool.setting import REDIS_CONNECTION_STRING, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \
     PROXY_SCORE_INIT
 from random import choice
 from typing import List
@@ -18,16 +18,23 @@ class RedisClient(object):
     redis connection client of proxypool
     """
 
-    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, **kwargs):
+    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB,
+                 connection_string=REDIS_CONNECTION_STRING, **kwargs):
         """
         init redis client
         :param host: redis host
         :param port: redis port
         :param password: redis password
+        :param connection_string: redis connection_string
         """
-        self.db = redis.StrictRedis(host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)
+        # if set connection_string, just use it
+        if connection_string:
+            self.db = redis.StrictRedis.from_url(connection_string, decode_responses=True, **kwargs)
+        else:
+            self.db = redis.StrictRedis(
+                host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)
 
-    def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
+    def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int:
         """
         add proxy and set it to init score
         :param proxy: proxy, ip:port, like 8.8.8.8:88
@@ -37,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
         if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
             logger.info(f'invalid proxy {proxy}, throw it')
             return
-        if not self.exists(proxy):
+        if not self.exists(proxy, redis_key):
             if IS_REDIS_VERSION_2:
-                return self.db.zadd(REDIS_KEY, score, proxy.string())
-            return self.db.zadd(REDIS_KEY, {proxy.string(): score})
+                return self.db.zadd(redis_key, score, proxy.string())
+            return self.db.zadd(redis_key, {proxy.string(): score})
 
-    def random(self) -> Proxy:
+    def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy:
         """
         get random proxy
         firstly try to get proxy with max score
@@ -51,73 +58,75 @@ def random(self) -> Proxy:
         :return: proxy, like 8.8.8.8:8
         """
         # try to get proxy with max score
-        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
+        proxies = self.db.zrangebyscore(
+            redis_key, proxy_score_max, proxy_score_max)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else get proxy by rank
-        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
+        proxies = self.db.zrevrange(
+            redis_key, proxy_score_min, proxy_score_max)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else raise error
         raise PoolEmptyException
 
-    def decrease(self, proxy: Proxy) -> int:
+    def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int:
         """
         decrease score of proxy, if small than PROXY_SCORE_MIN, delete it
         :param proxy: proxy
         :return: new score
         """
         if IS_REDIS_VERSION_2:
-            self.db.zincrby(REDIS_KEY, proxy.string(), -1)
+            self.db.zincrby(redis_key, proxy.string(), -1)
         else:
-            self.db.zincrby(REDIS_KEY, -1, proxy.string())
-        score = self.db.zscore(REDIS_KEY, proxy.string())
+            self.db.zincrby(redis_key, -1, proxy.string())
+        score = self.db.zscore(redis_key, proxy.string())
         logger.info(f'{proxy.string()} score decrease 1, current {score}')
-        if score <= PROXY_SCORE_MIN:
+        if score <= proxy_score_min:
             logger.info(f'{proxy.string()} current score {score}, remove')
-            self.db.zrem(REDIS_KEY, proxy.string())
+            self.db.zrem(redis_key, proxy.string())
 
-    def exists(self, proxy: Proxy) -> bool:
+    def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool:
         """
         if proxy exists
         :param proxy: proxy
         :return: if exists, bool
         """
-        return not self.db.zscore(REDIS_KEY, proxy.string()) is None
+        return not self.db.zscore(redis_key, proxy.string()) is None
 
-    def max(self, proxy: Proxy) -> int:
+    def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int:
        """
         set proxy to max score
         :param proxy: proxy
         :return: new score
         """
-        logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
+        logger.info(f'{proxy.string()} is valid, set to {proxy_score_max}')
         if IS_REDIS_VERSION_2:
-            return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
-        return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
+            return self.db.zadd(redis_key, proxy_score_max, proxy.string())
+        return self.db.zadd(redis_key, {proxy.string(): proxy_score_max})
 
-    def count(self) -> int:
+    def count(self, redis_key=REDIS_KEY) -> int:
         """
         get count of proxies
         :return: count, int
         """
-        return self.db.zcard(REDIS_KEY)
+        return self.db.zcard(redis_key)
 
-    def all(self) -> List[Proxy]:
+    def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]:
         """
         get all proxies
         :return: list of proxies
         """
-        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
+        return convert_proxy_or_proxies(self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max))
 
-    def batch(self, cursor, count) -> List[Proxy]:
+    def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]:
         """
         get batch of proxies
         :param cursor: scan cursor
         :param count: scan count
         :return: list of proxies
         """
-        cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
+        cursor, proxies = self.db.zscan(redis_key, cursor, count=count)
         return cursor, convert_proxy_or_proxies([i[0] for i in proxies])
 
@@ -125,4 +134,3 @@ def batch(self, cursor, count) -> List[Proxy]:
     conn = RedisClient()
     result = conn.random()
     print(result)
-
diff --git a/proxypool/testers/__init__.py b/proxypool/testers/__init__.py
new file mode 100644
index 00000000..4e4df95e
--- /dev/null
+++ b/proxypool/testers/__init__.py
@@ -0,0 +1,16 @@
+import pkgutil
+from .base import BaseTester
+import inspect
+
+
+# load classes subclass of BaseTester
+classes = []
+for loader, name, is_pkg in pkgutil.walk_packages(__path__):
+    module = loader.find_module(name).load_module(name)
+    for name, value in inspect.getmembers(module):
+        globals()[name] = value
+        if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \
+                and not getattr(value, 'ignore', False):
+            classes.append(value)
+__all__ = __ALL__ = classes
+
diff --git a/proxypool/testers/base.py b/proxypool/testers/base.py
new file mode 100644
index 00000000..796b7cfc
--- /dev/null
+++ b/proxypool/testers/base.py
@@ -0,0 +1,19 @@
+from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN
+
+
+class BaseTester(object):
+    test_url = ""
+    key = ""
+    test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE
+    proxy_score_init = PROXY_SCORE_INIT
+    proxy_score_max = PROXY_SCORE_MAX
+    proxy_score_min = PROXY_SCORE_MIN
+
+    def headers(self):
+        return None
+
+    def cookies(self):
+        return None
+
+    async def parse(self, html, url, proxy, expr='{"code":0'):
+        return True if expr in html else False
diff --git a/proxypool/utils/parse.py b/proxypool/utils/parse.py
deleted file mode 100644
index b3f42f5f..00000000
--- a/proxypool/utils/parse.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import re
-
-def parse_redis_connection_string(connection_string):
-    """
-    parse a redis connection string, for example:
-    redis://[password]@host:port
-    rediss://[password]@host:port
-    :param connection_string:
-    :return:
-    """
-    result = re.match('rediss?:\/\/(.*?)@(.*?):(\d+)\/(\d+)', connection_string)
-    return result.group(2), int(result.group(3)), (result.group(1) or None), (result.group(4) or 0) if result \
-        else ('localhost', 6379, None)
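Together with the new testers package, the `redis_key` parameters added to `RedisClient` above let every tester key keep its own sorted set next to the universal pool. A hypothetical usage sketch of the extended storage API; the key name and proxy address are made up:

```python
# Hypothetical exercise of the per-key storage API; 'example' and 8.8.8.8:8888
# are placeholders, not values from this repo.
from proxypool.schemas.proxy import Proxy
from proxypool.storages.redis import RedisClient

conn = RedisClient()
proxy = Proxy(host='8.8.8.8', port=8888)

conn.add(proxy)                       # universal pool (proxies:universal)
conn.add(proxy, redis_key='example')  # sub-pool for a specific tester key
conn.max(proxy, redis_key='example', proxy_score_max=100)
print(conn.random(redis_key='example'))   # draw only from that sub-pool
```
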
diff --git a/proxypool/utils/proxy.py b/proxypool/utils/proxy.py
index b22e05a4..5330ceb4 100644
--- a/proxypool/utils/proxy.py
+++ b/proxypool/utils/proxy.py
@@ -2,7 +2,13 @@
 
 
 def is_valid_proxy(data):
-    if data.__contains__(':'):
+    """
+    check this string is within proxy format
+    """
+    if is_auth_proxy(data):
+        host, port = extract_auth_proxy(data)
+        return is_ip_valid(host) and is_port_valid(port)
+    elif data.__contains__(':'):
         ip = data.split(':')[0]
         port = data.split(':')[1]
         return is_ip_valid(ip) and is_port_valid(port)
@@ -11,6 +17,11 @@ def is_valid_proxy(data):
 
 
 def is_ip_valid(ip):
+    """
+    check this string is within ip format
+    """
+    if is_auth_proxy(ip):
+        ip = ip.split('@')[1]
     a = ip.split('.')
     if len(a) != 4:
         return False
@@ -42,9 +53,36 @@ def convert_proxy_or_proxies(data):
             # skip invalid item
             item = item.strip()
             if not is_valid_proxy(item): continue
-            host, port = item.split(':')
+            if is_auth_proxy(item):
+                host, port = extract_auth_proxy(item)
+            else:
+                host, port, *_ = item.split(':')
             result.append(Proxy(host=host, port=int(port)))
         return result
     if isinstance(data, str) and is_valid_proxy(data):
-        host, port = data.split(':')
+        if is_auth_proxy(data):
+            host, port = extract_auth_proxy(data)
+        else:
+            host, port = data.split(':')
         return Proxy(host=host, port=int(port))
+
+
+def is_auth_proxy(data: str) -> bool:
+    return '@' in data
+
+
+def extract_auth_proxy(data: str) -> (str, str):
+    """
+    extract host and port from a proxy with authentication
+    """
+    auth = data.split('@')[0]
+    ip_port = data.split('@')[1]
+    ip = ip_port.split(':')[0]
+    port = ip_port.split(':')[1]
+    host = auth + '@' + ip
+    return host, port
+
+
+if __name__ == '__main__':
+    proxy = 'test1234:test5678.@117.68.216.212:32425'
+    print(extract_auth_proxy(proxy))
diff --git a/release.sh b/release.sh
new file mode 100755
index 00000000..342cd06f
--- /dev/null
+++ b/release.sh
@@ -0,0 +1,2 @@
+git tag -a "`date +'%Y%m%d'`" -m "Release `date +'%Y%m%d'`"
+git push origin --tags
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index cc0b6111..33f35c50 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,17 @@
-environs==7.2.0
-Flask==1.0.3
-attrs==19.1.0
-retrying==1.3.3
-aiohttp==3.6.2
-requests==2.22.0
-loguru==0.3.2
-pyquery==1.4.0
-supervisor==4.1.0
-redis==2.10.6
-lxml==4.3.3
\ No newline at end of file
+environs>=9.3.0,<10.0.0
+Flask>=1.1.2,<2.0.0
+attrs>=20.3.0,<21.0.0
+retrying>=1.3.3,<2.0.0
+aiohttp>=3.8.1,<4.0.0
+requests>=2.25.1,<3.0.0
+loguru>=0.5.3,<1.0.0
+pyquery>=1.4.3,<2.0.0
+supervisor>=4.2.1,<5.0.0
+redis>=3.5.3,<4.0.0
+lxml>=4.6.5,<5.0.0
+fake_headers>=1.0.2,<2.0.0
+maxminddb_geolite2==2018.703
+gevent>=21.8.0,<24.0.0
+tornado>=6.0,<7.0
+itsdangerous==0.24
+MarkupSafe<2.1.0
diff --git a/supervisord.conf b/supervisord.conf
index c5828e0c..aff2cd64 100644
--- a/supervisord.conf
+++ b/supervisord.conf
@@ -1,17 +1,40 @@
+[unix_http_server]
+file=/run/supervisor.sock
+chmod=0700
+
 [supervisord]
+pidfile=/run/supervisord.pid
 nodaemon=true
 
+[supervisorctl]
+serverurl=unix:///run/supervisor.sock
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface
+
 [program:tester]
 process_name=tester
 command=python3 run.py --processor tester
 directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
 
 [program:getter]
 process_name=getter
 command=python3 run.py --processor getter
 directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
 
 [program:server]
 process_name=server
 command=python3 run.py --processor server
-directory=/app
\ No newline at end of file
+directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
\ No newline at end of file
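With the new `API_KEY` setting (see `setting.py` above), a non-empty key is expected to arrive in an `API-KEY` request header; an empty `API_KEY` keeps the endpoints open. A client-side sketch, assuming the server listens on the default port 5555 and the key was set to the placeholder value below:

```python
# Placeholder key; /random is the project's existing random-proxy endpoint.
import requests

resp = requests.get('http://localhost:5555/random',
                    headers={'API-KEY': 'your-secret-key'})
print(resp.text)  # e.g. '8.8.8.8:8888'
```
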