This repository has been archived by the owner on Jan 9, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 196
/
xuetangx.py
220 lines (177 loc) · 8.61 KB
/
xuetangx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" XuetangX (学堂在线) course downloader """
import re
import os
import sys
import json
import requests
from bs4 import BeautifulSoup
# Base URL of the site; site-relative links are joined onto it
BASE_URL = 'http://www.xuetangx.com'
# Module-wide HTTP session used for the requests made in this file
CONNECTION = requests.Session()
CONNECTION.headers.update({'User-Agent': 'Mozilla/5.0'})
# Matches a run of one or more whitespace characters
REG_SPACES = re.compile(r'\s+')
# Matches the characters that are illegal in Windows file names
REG_FILE = re.compile(r'[\\/:\*\?"<>\|]')
def get_book(url):
    """Download every PDF e-book linked from a course page.

    The page at *url* is scanned for ``/courses/.../pdfbook/<n>/`` links
    ("shelves").  Each shelf page is fetched and every entry of its
    ``#booknav`` list is saved as ``<BASE_DIR>/Books/<shelf number>/<title>.pdf``.

    Args:
        url: Address of a course page that contains the navigation bar.
    """
    print('正在获取电子书……')
    # Page that contains the navigation bar
    nav_page = CONNECTION.get(url).text

    # Non-greedy and restricted to URL characters so that two links on the
    # same line of HTML are not merged into one match; ``\d+`` also accepts
    # shelf numbers of 10 and above.
    found = set(re.findall(r'/courses/[^\s"\'<>]+?/pdfbook/\d+/', nav_page))
    # Sort by shelf number so that folder numbering is stable between runs
    # (iterating a bare set gives an arbitrary order).
    shelves = sorted(
        found,
        key=lambda link: (int(link.rstrip('/').rsplit('/', 1)[-1]), link),
    )

    for shelf_count, shelf in enumerate(shelves, 1):
        res = CONNECTION.get(BASE_URL + shelf).text
        soup = BeautifulSoup(res, 'lxml')
        save_dir = os.path.join(BASE_DIR, 'Books', str(shelf_count))
        os.makedirs(save_dir, exist_ok=True)

        for book in soup.select('#booknav a'):
            # ``.string`` is None when the link holds nested markup; fall
            # back to the concatenated text in that case.
            title = book.string or book.get_text(strip=True)
            print('------>', title)
            file_name = REG_FILE.sub(' ', title) + '.pdf'
            # The PDF address is stored in the link's ``rel`` attribute
            pdf = CONNECTION.get(BASE_URL + book['rel'][0]).content
            with open(os.path.join(save_dir, file_name), 'wb') as pdf_file:
                pdf_file.write(pdf)
def get_handout(url):
    """Save the hand-outs section of the course info page as an HTML file.

    The element with class ``handouts`` is extracted from the page at *url*,
    its site-relative links are made absolute, and the result is wrapped in a
    minimal HTML document written to ``<BASE_DIR>/Handouts.html``.
    """
    page = BeautifulSoup(CONNECTION.get(url).text, 'lxml')
    handouts = page.find(class_='handouts')

    # Links starting with "/" are relative to the site root
    for anchor in handouts.select('a[href^="/"]'):
        anchor['href'] = BASE_URL + anchor['href']

    template = (
        '<!DOCTYPE html>\n<html>\n<head>\n<title>讲义</title>\n'
        '<meta charset="utf-8">\n</head>\n<body>\n%s</body>\n</html>\n'
    )
    target = os.path.join(BASE_DIR, 'Handouts.html')
    with open(target, 'w', encoding='utf-8') as handouts_html:
        handouts_html.write(template % handouts.prettify())
def get_video(video_id, file_name):
    """Record the download URL of one video and a matching rename command.

    The video sources are looked up by *video_id*; the ``quality20`` source
    is preferred and ``quality10`` is used when it is missing.  The chosen
    URL is appended to the ``VIDEOS`` file and a ``REN`` line mapping the
    remote file name to ``<file_name>.mp4`` is appended to ``RENAMER``.

    Args:
        video_id: Identifier passed to the ``videoid2source`` endpoint.
        file_name: Target name for the video, without extension.

    Raises:
        KeyError, IndexError, TypeError: If neither quality is available.
    """
    res = CONNECTION.get('https://xuetangx.com/videoid2source/' + video_id).text
    # Parse once; both quality look-ups read the same mapping
    sources = json.loads(res)['sources']
    try:
        video_url = sources['quality20'][0]
    except (KeyError, IndexError, TypeError):
        # quality20 absent, empty or null: fall back to quality10
        video_url = sources['quality10'][0]

    VIDEOS.write(video_url + '\n')

    # Remote file name, e.g. "<id>-20.mp4"; the dot is escaped so that it
    # only matches a literal "."
    match = re.search(r'(\w+-[12]0\.mp4)', video_url)
    if match:
        remote_name = match.group(1)
    else:
        # Unexpected URL shape: use the last path segment without the query
        remote_name = video_url.rsplit('/', 1)[-1].split('?', 1)[0]
    RENAMER.write('REN "' + remote_name + '" "%s.mp4"\n' % file_name)
def _save_subtitles(player, file_name):
    """Download every available subtitle track of one video into BASE_DIR.

    Args:
        player: The video element carrying the ``data-transcript-*`` attributes.
        file_name: Base name of the video (no extension) used for the files.
    """
    available_url = BASE_URL + player['data-transcript-available-translations-url']
    languages = CONNECTION.get(available_url).json()
    translation_base = BASE_URL + player['data-transcript-translation-url'] + '/'
    print('$$$', languages)

    # The download endpoint is the sibling of ".../available_translations".
    # Replace the last path segment explicitly: str.rstrip() removes a set
    # of characters, not a suffix.
    download_url = available_url.rsplit('/', 1)[0] + '/download'

    # The language code is only added to the name when there are several tracks
    multi_subtitle = len(languages) != 1
    for language in languages:
        if multi_subtitle:
            sub_file_name = file_name + '_' + language + '.srt'
        else:
            sub_file_name = file_name + '.srt'
        track_url = translation_base + language
        print('$$$', track_url)
        print('$$$', sub_file_name)
        # The track URL is requested before the download, as in the original
        # flow; presumably this selects the language served by "download"
        # -- TODO confirm against the site.
        CONNECTION.get(track_url)
        subtitle = CONNECTION.get(download_url).content
        with open(os.path.join(BASE_DIR, sub_file_name), 'wb') as subtitle_file:
            subtitle_file.write(subtitle)


def get_content(url):
    """Walk the courseware page and process every chapter, section and tab.

    The hierarchy is printed and written to ``OUTLINE``.  For each video
    block, ``get_video`` is called to record its URL and rename command, and
    its subtitles are downloaded into ``BASE_DIR``.

    Args:
        url: Address of the courseware page (the page shown after entering
            the course).
    """
    courseware = CONNECTION.get(url).text
    soup = BeautifulSoup(courseware, 'lxml')
    # DOM nodes of all chapters
    chapters = soup.find(id='accordion').find_all(class_='chapter')

    for chapter_count, chapter in enumerate(chapters, 1):
        chapter_title = chapter.h3.a.get_text(strip=True)
        print('%s' % chapter_title)
        OUTLINE.write('%s {%d}\n' % (chapter_title, chapter_count))

        # Each link under the chapter's list is one section
        sections = chapter.select('ul a')
        for section_count, section_info in enumerate(sections, 1):
            section_url = BASE_URL + section_info['href']
            section_title = section_info.p.string.strip()
            print(' %s' % section_title)
            OUTLINE.write(' %s {%d.%d}\n' % (section_title, chapter_count, section_count))

            # Page of this section; kept in its own name so the courseware
            # soup above is not shadowed
            section_page = CONNECTION.get(section_url).text
            section_soup = BeautifulSoup(section_page, 'lxml')
            tabs = section_soup.find(id='sequence-list').find_all('li')

            # Video numbering restarts in every section
            video_sec_count = 0
            for tab_count, tab_info in enumerate(tabs, 1):
                # "title" may contain line breaks and duplicates, so the
                # data-page-title attribute is used instead
                tab_title = tab_info.a.get('data-page-title')
                print(' %s' % tab_title)
                OUTLINE.write(' %s {%d.%d.%d}\n' % (tab_title, chapter_count, section_count, tab_count))

                # The tab body is stored as escaped markup in the element
                # whose id is the tab's aria-controls value
                tab_sequence = tab_info.a.get('aria-controls')
                tab_escape = section_soup.find(id=tab_sequence).string
                tab = BeautifulSoup(tab_escape, 'lxml').div.div

                for block in tab.find_all('div', class_='xblock'):
                    # Only video blocks are processed; a few blocks have no
                    # data-type attribute at all (get() returns None then)
                    if block.get('data-type') != 'Video':
                        continue

                    video_sec_count += 1
                    # Collapse runs of whitespace into a single space
                    video_name = REG_SPACES.sub(' ', block.h2.string.strip())
                    OUTLINE.write(' %s {%d.%d.%d}*\n' % (video_name, chapter_count, section_count, video_sec_count))
                    video_id = block.div['data-ccsource']

                    # File name: numbering prefix plus the sanitised title
                    file_name = REG_FILE.sub(' ', video_name)
                    file_name = '%d.%d.%d %s' % (chapter_count, section_count, video_sec_count, file_name)
                    print('------>', file_name)
                    get_video(video_id, file_name)

                    _save_subtitles(block.div, file_name)
def start(url, path='', book=True, cookies=None):
    """Run the whole download for one course.

    Verifies the login cookies, reads the course title and institution from
    the "about" page, creates ``<path>/<title> - <institution>`` and then
    fetches e-books (optional), hand-outs and the courseware content.

    Sets the module globals ``BASE_DIR``, ``OUTLINE``, ``VIDEOS`` and
    ``RENAMER`` that the other functions of this module rely on.

    Args:
        url: Address of the course "about" page.
        path: Directory in which the course folder is created.
        book: Whether the PDF e-books are downloaded.
        cookies: Mapping of cookie names to values of a logged-in session.
    """
    global BASE_DIR, VIDEOS, RENAMER, OUTLINE

    # None instead of a shared mutable default
    requests.utils.add_dict_to_cookiejar(CONNECTION.cookies, cookies or {})
    status = CONNECTION.get('http://www.xuetangx.com/header_ajax')
    if status.json()['login']:
        print('验证成功!\n')
    else:
        print('Cookies 失效,请获取新的 cookies!')
        sys.exit(1)

    # Course information ("about") page
    about_page = CONNECTION.get(url).text
    soup = BeautifulSoup(about_page, 'lxml')
    course_name = soup.find(id='title1').string
    # Publisher of the course (usually a university)
    institution = soup.find(class_='courseabout_text').a.string
    # Title with the characters removed that are illegal in file names
    dir_name = REG_FILE.sub('', course_name + ' - ' + institution)
    print(dir_name)

    BASE_DIR = os.path.join(path, dir_name)
    os.makedirs(BASE_DIR, exist_ok=True)

    # Course root ending in "/".  The suffix is removed explicitly because
    # str.rstrip() strips a set of characters rather than a suffix and did
    # nothing for URLs with a trailing slash.
    course_root = url.rstrip('/')
    if course_root.endswith('/about'):
        course_root = course_root[:-len('about')]
    else:
        course_root += '/'
    courseware = course_root + 'courseware'
    handout = course_root + 'info'

    # The with-statement closes all three files even if crawling fails
    with open(os.path.join(BASE_DIR, 'Outline.txt'), 'w', encoding='utf-8') as OUTLINE, \
            open(os.path.join(BASE_DIR, 'Videos.txt'), 'w', encoding='utf-8') as VIDEOS, \
            open(os.path.join(BASE_DIR, 'Rename.bat'), 'w', encoding='utf-8') as RENAMER:
        # Switch the console to UTF-8 so the batch file can use the names
        RENAMER.write('CHCP 65001\n\n')

        if book:
            # Using the info page as the entry point is faster
            get_book(handout)
        get_handout(handout)
        get_content(courseware)
if __name__ == '__main__':
    # Script entry point. Nothing runs by default; the line below is an
    # example call that is kept commented out.
    # start('http://www.xuetangx.com/courses/course-v1:TsinghuaX+00740043X_2015_T2+sp/about', r'F:\MOOCs', True)
    pass