Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
add GitHub spider 异步
- Loading branch information
1 parent
0146704
commit d70aa9c
Showing
10 changed files
with
833 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env python | ||
# -*- coding:utf-8 -*- | ||
# | ||
# Author : XueWeiHan | ||
# Date : 17/8/30 下午5:58 | ||
# Desc : base spider | ||
import os | ||
import logging | ||
|
||
import requests | ||
|
||
|
||
class BaseSpider(object):
    """Common base for spiders: file logging plus a guarded HTTP GET.

    ``spider_name`` names both the logger and the ``<spider_name>.txt`` log
    file that is written next to this module.
    """
    spider_name = 'base'

    def __init__(self):
        # Raise the log level of the HTTP libraries to WARNING.
        for library in ("requests", "urllib3"):
            logging.getLogger(library).setLevel(logging.WARNING)

        log_file = os.path.join(
            os.path.dirname(__file__),
            '{name}.txt'.format(name=self.spider_name))
        log_format = ('%(name)s %(asctime)s %(filename)s[line:%(lineno)d] '
                      '%(levelname)s %(message)s')
        logging.basicConfig(level=logging.INFO, filename=log_file,
                            filemode='a', format=log_format)
        # Name the logger after the spider.
        self.logger = logging.getLogger(self.spider_name)

    def get_data(self, url):
        """GET ``url`` with a 20 second timeout.

        :return: the response object, or None (after logging the error)
                 when anything raises.
        """
        try:
            return requests.get(url, timeout=20)
        except Exception as e:
            self.logger.error(u"获取 {url} 数据失败:{e}".format(url=url, e=e))
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/usr/bin/env python | ||
# -*- coding:utf-8 -*- | ||
# | ||
# Author : XueWeiHan | ||
# Date : 17/8/30 下午4:40 | ||
# Desc : 配置 | ||
from huey import RedisHuey | ||
|
||
huey = RedisHuey() | ||
|
||
import os  # only needed for the environment override below

# Connection URL handed to playhouse.db_url.connect().
# The DATABASE_URL environment variable takes precedence, so credentials do
# not have to be edited in source; the fallback is the original local URL.
DATABASE_URL = os.environ.get(
    'DATABASE_URL', 'mysql://root:@127.0.0.1:3306/github?charset=utf8mb4')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env python | ||
# -*- coding:utf-8 -*- | ||
# | ||
# Author : XueWeiHan | ||
# Date : 17/8/30 下午5:33 | ||
# Desc : Model | ||
from datetime import datetime, date | ||
|
||
from peewee import Model | ||
from playhouse.db_url import connect | ||
from peewee import CharField, TextField, DateTimeField, IntegerField, DateField | ||
|
||
from config import DATABASE_URL | ||
|
||
database = connect(DATABASE_URL) | ||
|
||
|
||
class BaseModel(Model):
    """Base model that binds its subclasses to the module-level ``database``."""
    class Meta:
        database = database
|
||
|
||
class User(BaseModel):
    """A GitHub account as stored by the spider's ``fetch_user_info``."""
    uuid = CharField(max_length=150)  # filled from the API payload's ``id``
    name = CharField(max_length=255)  # filled from ``login``
    nickname = CharField(max_length=255)  # ``name``, falling back to ``login``
    avatar_url = CharField(max_length=255)
    html_url = CharField(max_length=255)
    public_repos = IntegerField()
    followers = IntegerField()
    stars_count = IntegerField(default=0)  # not set by the spider code shown here
    location = CharField(max_length=255)
    email = CharField(max_length=255, null=True)
    create_time = DateTimeField(default=datetime.now)
    # Defaults to the creation time; nothing here refreshes it automatically.
    update_time = DateTimeField(default=datetime.now)
|
||
|
||
class Proxy(BaseModel):
    """An HTTP proxy address the spider can route requests through."""
    url = CharField(max_length=150, unique=True)
    # 1 = selectable by the spider's get_proxy(); set to 0 after a failure.
    status = IntegerField(default=1)
    create_time = DateTimeField(default=datetime.now)
    update_time = DateTimeField(default=datetime.now)
    # When set, get_proxy() skips this proxy until the time has passed.
    reset_time = DateTimeField(null=True, default=None)
|
||
database.create_tables([User, Proxy], safe=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
#!/usr/bin/env python | ||
# -*- coding:utf-8 -*- | ||
# | ||
# Author : XueWeiHan | ||
# Date : 17/7/19 下午3:19 | ||
# Desc : github 爬虫 | ||
# API https://developer.github.com/v3/ | ||
# User https://api.github.com/users/521xueweihan | ||
# | ||
from gevent import monkey | ||
monkey.patch_all() | ||
|
||
|
||
import re | ||
import time | ||
from gevent.pool import Pool | ||
from functools import wraps | ||
from datetime import datetime | ||
|
||
import redis | ||
import requests | ||
from requests.exceptions import ProxyError, ConnectionError, Timeout | ||
from peewee import DoesNotExist, IntegrityError | ||
|
||
from github_model import User, Proxy | ||
|
||
# for debug to disable insecureWarning | ||
requests.packages.urllib3.disable_warnings() | ||
|
||
conn = redis.Redis(host='127.0.0.1', port=6379) | ||
|
||
class LimitError(Exception):
    """Raised by get_data when the API answers with HTTP 403."""
|
||
|
||
def save_users_info(info):
    """Placeholder: not implemented yet, ignores ``info`` and does nothing."""
    return None
|
||
|
||
def save_users(ids, names):
    """Append user ids and login names to the Redis lists.

    :param ids: sequence of user ids, pushed to the ``user.id`` list
    :param names: sequence of logins, pushed to the ``user.name`` list
    """
    # RPUSH requires at least one value; an empty result page would make
    # Redis reject the command, so empty sequences are skipped.
    if ids:
        conn.rpush('user.id', *ids)
    if names:
        conn.rpush('user.name', *names)
|
||
|
||
def get_all_user_id():
    """Return every entry of the Redis list ``user.id``."""
    # An end index of -1 means "through the last element", so no separate
    # LLEN round trip is needed and the read is a single atomic command.
    return conn.lrange('user.id', 0, -1)
|
||
|
||
def get_all_user_name():
    """Return every entry of the Redis list ``user.name``."""
    # An end index of -1 means "through the last element", so no separate
    # LLEN round trip is needed and the read is a single atomic command.
    return conn.lrange('user.name', 0, -1)
|
||
|
||
def fetch_proxies(path='/Users/xueweihan/Documents/github_project/proxyspider/proxy_list.txt'):
    """Load proxy addresses from a text file into the ``Proxy`` table.

    The first line of the file is skipped (presumably a header -- confirm
    against the proxy list format). For every other line the text before the
    first space is stored as the proxy URL.

    :param path: proxy list file; defaults to the original hard-coded location
    """
    # Read-only access is enough; 'r+' would fail on a read-only file.
    with open(path, 'r') as fb:
        next(fb, None)  # skip the first line
        for line in fb:
            # Strip first so a line without a space does not keep its newline.
            proxy_url = line.strip().split(' ')[0]
            if not proxy_url:
                continue  # blank line
            try:
                Proxy.create(url=proxy_url)
            except IntegrityError:
                # url is unique: this proxy is already stored.
                continue
|
||
|
||
def get_proxy():
    """Return the url of a usable proxy, or None when there is none.

    Usable means ``status == 1`` and ``reset_time`` is either unset or
    already in the past.
    """
    not_cooling_down = ((Proxy.reset_time.is_null()) |
                        (Proxy.reset_time < datetime.now()))
    try:
        return Proxy.get(Proxy.status == 1, not_cooling_down).url
    except DoesNotExist:
        return None
|
||
|
||
def make_params(url, proxy_url):
    """Build the keyword arguments passed to ``requests.get``.

    :param url: address to request
    :param proxy_url: ``host:port`` of a proxy, or a falsy value for none
    :return: dict with ``url``, ``verify`` and ``timeout``; plus ``params``
             (per_page=100) when the url has no ``per_page`` of its own, and
             ``proxies`` when a proxy is given
    """
    params = {'url': url, 'verify': False, 'timeout': 20}
    if not check_per_page(url):
        params['params'] = {'per_page': 100}
    if proxy_url:
        # The same proxy address serves both schemes.
        address = "http://{}".format(proxy_url)
        params['proxies'] = {'http': address, 'https': address}
    return params
|
||
|
||
def proxy(fn):
    """Decorator: call ``fn`` through a proxy, retrying until it returns.

    Each attempt asks get_proxy() for a proxy and passes it to ``fn`` as the
    ``proxy_url`` keyword. A LimitError puts the proxy on a one hour cooldown;
    any other exception marks it with status 0. The loop only ends when
    ``fn`` returns.
    """
    def mark_failed(proxy_url):
        # Take the proxy out of rotation.
        Proxy.update(status=0, update_time=datetime.now()).where(
            Proxy.url == proxy_url).execute()

    @wraps(fn)
    def wrap(*args):
        while True:
            proxy_url = get_proxy()
            try:
                return fn(*args, proxy_url=proxy_url)
            except (ProxyError, Timeout, ConnectionError):
                print('ProxyError')
                mark_failed(proxy_url)
            except LimitError:
                cooldown_until = datetime.fromtimestamp(
                    int(time.time()) + 60 * 60)
                Proxy.update(reset_time=cooldown_until,
                             update_time=datetime.now()).where(
                    Proxy.url == proxy_url).execute()
            except Exception:
                print('unknow error')
                mark_failed(proxy_url)

    return wrap
|
||
|
||
@proxy
def get_data(url, proxy_url=''):
    """Request ``url`` and return the response when it succeeds.

    :param url: address to request
    :param proxy_url: proxy to use; supplied by the ``proxy`` decorator
                      (None means a direct connection)
    :return: the response for HTTP 200; None for any status other than
             200 and 403, so callers must handle a None result
    :raises LimitError: on HTTP 403, after sleeping until the rate limit
                        resets (direct connection) or recording the reset
                        time on the proxy
    """
    params = make_params(url, proxy_url)
    response = requests.get(**params)
    print('%s %s %s' % (proxy_url, response.status_code, response.url))
    if response.status_code == 200:
        return response
    if response.status_code == 403:
        reset_datetime = check_limit(response.headers)
        if reset_datetime:
            if proxy_url is None:
                # No proxy to switch to: wait for the limit to reset.
                # Clamp at zero, because time.sleep rejects a negative
                # duration when the reset time has already passed.
                sleep_seconds = max(
                    0.0, (reset_datetime - datetime.now()).total_seconds())
                print('sleep: %s' % sleep_seconds)
                time.sleep(sleep_seconds)
            else:
                print('reset_second:  %s' % reset_datetime)
                Proxy.update(reset_time=reset_datetime,
                             update_time=datetime.now())\
                    .where(Proxy.url == proxy_url).execute()
        raise LimitError
    return None
|
||
|
||
def check_per_page(url):
    """Return True when the text ``per_page`` occurs in the url's query string."""
    query = requests.utils.urlparse(url).query
    return 'per_page' in query
|
||
|
||
def check_limit(headers):
    """
    Work out when an exhausted rate limit resets.

    :param headers: response headers (any mapping with ``get``)
    :return: datetime of the reset when ``X-RateLimit-Remaining`` is 0;
             None when quota remains, or when the rate-limit headers are
             missing or not numeric
    """
    try:
        remaining = int(headers.get('X-RateLimit-Remaining'))
        if remaining:
            return None
        return datetime.fromtimestamp(int(headers.get('X-RateLimit-Reset')))
    except (TypeError, ValueError):
        # A 403 without usable rate-limit headers is not a quota reset we
        # can schedule around.
        return None
|
||
|
||
def next_page_url(headers):
    """Extract the ``rel="next"`` url from a paginated response's Link header.

    :param headers: response headers (any mapping with ``get``)
    :return: the next page url, or None when there is no Link header or it
             has no ``next`` relation
    """
    link_params = headers.get('Link')
    if not link_params:
        return None
    # [^>]* keeps the match inside a single <...> entry. A greedy ".*" would
    # start at the first link (e.g. rel="prev") and return the wrong url.
    found = re.search(
        r'<(https://api\.github\.com/[^>]*)>; rel="next"', link_params)
    if not found:
        return None
    return found.group(1)
|
||
|
||
def fetch_user(url):
    """Fetch one page of user search results and queue the users in Redis.

    :param url: search API url of the page to fetch
    :return: url of the next page, or None when there is no next page or
             the page could not be fetched
    """
    response = get_data(url)
    if response is None:
        # get_data returns None for statuses other than 200/403; stop
        # paginating instead of crashing on response.json().
        return None

    items = response.json().get('items') or []
    ids = [item.get('id') for item in items]
    names = [item.get('login') for item in items]
    if items:
        # An empty page would otherwise push nothing, which Redis rejects.
        save_users(ids, names)
    return next_page_url(response.headers)
|
||
|
||
def fetch_all_user():
    """Walk every page of the ``location:china`` user search, most followed first."""
    next_url = 'https://api.github.com/search/users?q=location:china&sort=followers'
    # fetch_user returns the following page's url, or None on the last page.
    while next_url:
        next_url = fetch_user(next_url)
|
||
|
||
def fetch_user_info(name):
    """Fetch one user's profile from the API and store it as a ``User`` row.

    :param name: login of the user
    :return: None; nothing is stored when the profile could not be fetched
    """
    url = 'https://api.github.com/users/{}'.format(name)
    response = get_data(url)
    if response is None:
        # get_data returns None for statuses other than 200/403.
        return
    json_data = response.json()

    profile = {
        'name': json_data.get('login'),
        'nickname': json_data.get('name') or json_data.get('login'),
        'avatar_url': json_data.get('avatar_url'),
        'html_url': json_data.get('html_url'),
        # "or 0" also covers a key that is present but null.
        'public_repos': int(json_data.get('public_repos') or 0),
        'followers': int(json_data.get('followers') or 0),
        'location': json_data.get('location'),
        'email': json_data.get('email'),
    }
    # Match on the id only. Matching on every field would insert a duplicate
    # row as soon as a mutable value such as the follower count changes.
    User.get_or_create(uuid=json_data.get('id'), defaults=profile)
|
||
|
||
|
||
# fetch_proxies() | ||
# fetch_all_user() | ||
# | ||
|
||
# all user | ||
# Pool of size 20 used to fetch profiles concurrently.
pool = Pool(20)
# Login names read back from the Redis list ``user.name``.
all_users_name = get_all_user_name()

# Keep only the names that have no User row yet.
username_list = []
for user_name in all_users_name:
    try:
        User.get(User.name == user_name)
    except DoesNotExist:
        username_list.append(user_name)

# Fetch and store the missing profiles.
pool.map(fetch_user_info, username_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python | ||
# -*- coding:utf-8 -*- | ||
# | ||
# Author : XueWeiHan | ||
# Date : 17/8/30 下午4:48 | ||
# Desc : 入口 | ||
|
||
# main.py | ||
from config import huey # import our "huey" object | ||
from tasks import sub # import our task | ||
|
||
|
||
if __name__ == '__main__': | ||
pass |
Oops, something went wrong.