Skip to content

Commit 9074864

Browse files
committed
add connotation video crawled
1 parent 3f6c58e commit 9074864

File tree

2 files changed

+120
-4
lines changed

2 files changed

+120
-4
lines changed

README.md

Lines changed: 0 additions & 4 deletions
This file was deleted.

妹纸图/connotation_video.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# coding=utf-8
2+
import os
3+
import time
4+
import requests
5+
import threading
6+
from lxml import etree
7+
from selenium import webdriver
8+
from contextlib import closing
9+
from selenium.webdriver.common.by import By
10+
from selenium.webdriver.support.ui import WebDriverWait
11+
from selenium.webdriver.support import expected_conditions as EC
12+
13+
14+
class VideoDown():
    """Crawl the neihanshequ video feed and download the videos it lists.

    ``load_data`` drives a Firefox session through the feed page, pulls the
    title and video URL out of every list entry and hands each one to
    ``do_thread``, which downloads it on a background thread into
    ``video_path``.
    """

    # Characters replaced in file names: the ones Windows forbids, plus
    # control whitespace that can leak in from scraped text nodes.
    _ILLEGAL_NAME_CHARS = '\\/:*?"<>|\r\n\t'

    def __init__(self):
        self.first_position = 0
        self.count = 0  # number of videos handed to a download thread
        self.video_path = 'D:/Photo/VD/'
        self.threads = []  # download threads started but not yet joined
        self.content = []
        self.check_file()

    def check_file(self):
        """Create the download directory if it does not exist yet."""
        if not os.path.exists(self.video_path):
            os.makedirs(self.video_path)

    @staticmethod
    def _safe_title(title):
        """Return *title* made usable as a file name.

        Characters listed in ``_ILLEGAL_NAME_CHARS`` become ``'_'`` and
        surrounding whitespace is removed; an empty result becomes ``'_'``.
        """
        cleaned = ''.join(
            '_' if ch in VideoDown._ILLEGAL_NAME_CHARS else ch
            for ch in title).strip()
        return cleaned or '_'

    def load_data(self):
        """Page through the video feed and download every video found.

        Each round waits for the "loadMore" element, scrolls to the bottom,
        parses the page, starts a download for every video URL not seen in
        an earlier round, then clicks "loadMore". The loop ends when the
        "loadMore" element does not become visible within 10 seconds.
        The browser is always closed and all download threads are joined
        before this method returns.
        """
        video_url = "http://neihanshequ.com/video/"
        seen_urls = set()     # the full list is re-parsed every round
        used_titles = set()   # titles double as file names, keep them unique
        driver = webdriver.Firefox()  # start the browser session
        try:
            driver.maximize_window()
            driver.implicitly_wait(10)
            driver.get(video_url)

            while True:
                try:
                    # Locate the element through a locator tuple.
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.ID, 'loadMore')))
                except Exception as e:
                    print("WebDriverWait Error : ", str(e))
                    break

                driver.execute_script(
                    'window.scrollTo(0,document.body.scrollHeight)')

                # Give the browser 10 seconds to load the new content.
                time.sleep(10)

                source = etree.HTML(driver.page_source)
                for item in source.xpath('//*[@id="detail-list"]/li'):
                    title = item.xpath('./div/div[2]/a/div/p/text()')
                    v_url = item.xpath(
                        './/div[@class="player-container"]/@data-src')
                    title = title[0] if title else '无介绍'
                    v_url = v_url[0] if v_url else ""

                    # Nothing to download, or already handled earlier.
                    if not v_url or v_url in seen_urls:
                        continue
                    seen_urls.add(v_url)

                    self.count += 1
                    print('第%d条数据' % self.count)

                    # Two videos with the same title would otherwise write
                    # to the same file.
                    title = self._safe_title(title)
                    if title in used_titles:
                        title = '{}_{}'.format(title, self.count)
                    used_titles.add(title)

                    self.do_thread(title, v_url)

                try:
                    # find_element(By.ID, ...) replaces find_element_by_id,
                    # which Selenium 4 removed.
                    load_more = WebDriverWait(driver, 10).until(
                        lambda d: d.find_element(By.ID, 'loadMore'))
                    load_more.click()
                    time.sleep(10)
                except Exception as e:
                    print("load more error : ", str(e))
        finally:
            try:
                driver.quit()
            finally:
                self.wait_all()

    def do_thread(self, title, url):
        """Start a background download of *url* and remember its thread.

        Returns as soon as the thread is started; call ``wait_all`` to
        block until the downloads have finished.
        """
        t = threading.Thread(target=self.down_video, args=(title, url))
        self.threads.append(t)
        t.start()

    def wait_all(self):
        """Join every thread started by ``do_thread`` and forget it."""
        while self.threads:
            self.threads.pop().join()

    def down_video(self, title, url):
        """Download *url* to ``<video_path><title>.mp4``.

        The download is skipped when a file of the reported size already
        exists. Failures are printed, never raised, so one bad video does
        not stop the others.
        """
        if not url:
            print('error : ', 'empty video url')
            return

        file_name = self.video_path + '{}.mp4'.format(self._safe_title(title))
        try:
            # Stream the response so the whole video is never held in memory.
            with closing(requests.get(url, stream=True, timeout=30)) as response:
                print(url)
                # Do not store an HTTP error page as a video file.
                response.raise_for_status()
                chunk_size = 1024
                # 0 means the server did not report a length.
                content_size = int(response.headers.get('content-length', 0))

                if (content_size and os.path.exists(file_name)
                        and os.path.getsize(file_name) == content_size):
                    print('跳过 ' + file_name)
                    return

                # Progress can only be reported when the total is known.
                down = DownProgress(title, content_size) if content_size else None
                with open(file_name, 'wb') as f:
                    for data in response.iter_content(chunk_size=chunk_size):
                        f.write(data)
                        if down is not None:
                            down.refresh_down(len(data))
        except Exception as e:
            print('error : ', str(e))
98+
99+
100+
class DownProgress():
    """Accumulate downloaded byte counts for one file and print progress."""

    def __init__(self, file_name, file_size):
        """Remember the display name and the expected total size in bytes."""
        self.file_name = file_name
        self.file_down = 0  # bytes received so far
        self.file_size = file_size

    def refresh_down(self, down):
        """Add *down* received bytes and print the updated progress line.

        When ``file_size`` is zero or negative the percentage cannot be
        computed; it is reported as 100.00 instead of dividing by zero.
        """
        self.file_down = self.file_down + down
        if self.file_size > 0:
            progress = (self.file_down / float(self.file_size)) * 100
        else:
            progress = 100.0
        status = '下载完成 ' if self.file_down >= self.file_size else "正在下载"
        print('文件名称 : {}, 下载进度 : {}, 下载状态 : {}'.format(self.file_name,
                                                       '%.2f' % progress, status))
113+
114+
115+
if __name__ == '__main__':
    # Run one full crawl-and-download pass and report how long it took.
    began = time.time()
    VideoDown().load_data()
    elapsed = time.time() - began
    print("下载共花费时间{}秒".format(elapsed))

0 commit comments

Comments
 (0)